/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lenya.cms.cocoon.generation;

import org.apache.cocoon.generation.ServiceableGenerator;
import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.environment.ObjectModelHelper;
import org.apache.cocoon.environment.Request;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.excalibur.source.Source;
import org.apache.lenya.cms.publication.DocumentFactory;
import org.apache.lenya.cms.publication.DocumentUtil;
import org.apache.lenya.cms.repository.RepositoryException;
import org.apache.lenya.cms.repository.RepositoryUtil;
import org.apache.lenya.cms.repository.Session;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Map;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
/**
 * Generates a list of the links that are reachable from the src, together
 * with their status.
 *
 * <pre>
 * &lt;map:generator name="linkStatus" src="org.apache.lenya.cms.cocoon.generation.LinkStatusGenerator"/&gt;
 *
 * &lt;map:generate type="linkStatus" src="/{pubid}/{area}/{doc-id}.html"&gt;
 *   &lt;map:parameter name="depth" value="1"/&gt;
 * &lt;/map:generate&gt;
 * </pre>
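 *
 * <p>
 * For each checked external link, a <code>link</code> element is emitted with
 * <code>href</code>, <code>referrer</code>, <code>content</code>,
 * <code>status</code> and <code>message</code> attributes. An illustrative
 * example (the attribute values depend on the crawled site):
 * </p>
 * <pre>
 * &lt;linkstatus:link href="http://example.org/" referrer="/pub/live/index.html"
 *     content="text/html" status="200" message="OK"/&gt;
 * </pre>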
 */
public class LinkStatusGenerator extends ServiceableGenerator implements
        Recyclable, Configurable {

    /** The URI of the namespace of this generator. */
    protected static final String URI = "http://apache.org/cocoon/linkstatus/2.0";

    /** The namespace prefix for this namespace. */
    protected static final String PREFIX = "linkstatus";

    /* Node and attribute names */
    protected static final String TOP_NODE_NAME = "linkstatus";
    protected static final String LINK_NODE_NAME = "link";

    protected static final String HREF_ATTR_NAME = "href";
    protected static final String REFERRER_ATTR_NAME = "referrer";
    protected static final String CONTENT_ATTR_NAME = "content";
    protected static final String STATUS_ATTR_NAME = "status";
    protected static final String MESSAGE_ATTR_NAME = "message";

    protected AttributesImpl attributes;

    /**
     * Config element name specifying the expected link content-type.
     * <p>
     * Its value is <code>link-content-type</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of the <code>link-content-type</code> configuration value.
     * <p>
     * Its value is <code>application/x-cocoon-links</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";

    /**
     * Config element name specifying the query-string appended when requesting
     * the links of a URL.
     * <p>
     * Its value is <code>link-view-query</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of the <code>link-view-query</code> configuration value.
     * <p>
     * Its value is <code>cocoon-view=links</code> (the <code>?</code>
     * separator is added when the query is appended to a URL).
     * </p>
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
    /**
     * Config element name specifying an excluding regular expression pattern.
     * <p>
     * Its value is <code>exclude</code>.
     * </p>
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying an including regular expression pattern.
     * <p>
     * Its value is <code>include</code>.
     * </p>
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP <code>User-Agent</code> header value.
     * <p>
     * Its value is <code>user-agent</code>.
     * </p>
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of the <code>user-agent</code> configuration value.
     *
     * @see org.apache.cocoon.Constants#COMPLETE_NAME
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;

    /**
     * Config element name specifying the HTTP <code>Accept</code> header value.
     * <p>
     * Its value is <code>accept</code>.
     * </p>
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of the <code>accept</code> configuration value.
     * <p>
     * Its value is <code>*&#47;*</code>.
     * </p>
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;

    private HashSet crawled;
    private HashSet linksToProcess;

    /** The depth parameter determines how many levels of links this generator follows. */
    protected int depth = 1;

    protected Source inputSource;
    String src;
    private DocumentFactory identityMap;
    /**
     * Stores a link to process together with its referrer and crawling depth.
     */
    private static class Link {
        private String uri;
        private String referrer;
        private int linkDepth;

        public Link(String uri, String referrer, int linkDepth) {
            this.uri = uri;
            this.referrer = referrer;
            this.linkDepth = linkDepth;
        }

        public String getURI() {
            return uri;
        }

        public String getReferrer() {
            return referrer;
        }

        public int getDepth() {
            return linkDepth;
        }

        /**
         * Links are equal if their URIs are equal; referrer and depth are
         * ignored. equals(Object) and hashCode() are overridden so that
         * HashSet.contains() works on Link instances.
         */
        public boolean equals(Object o) {
            return (o instanceof Link) && uri.equals(((Link) o).getURI());
        }

        public int hashCode() {
            return uri.hashCode();
        }
    }

    /**
     * Configure the crawler component.
     * <p>
     * The configuration can specify which URIs to include in and which URIs to
     * exclude from crawling. The patterns are given as regular expressions.
     * </p>
     * <p>
     * Moreover, you can configure the required content-type of the crawling
     * request and the query-string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; cocoon-view=links &lt;/link-view-query&gt;
     * &lt;user-agent&gt; Cocoon &lt;/user-agent&gt;
     * &lt;accept&gt; text/xml &lt;/accept&gt;
     * </tt></pre>
     *
     * @param configuration XML configuration of this avalon component.
     * @exception ConfigurationException thrown if the configuration is invalid.
     */
    public void configure(Configuration configuration)
            throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    // a pattern element may hold a comma-separated list of patterns
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        this.includeCrawlingURL.add(new RE(params[index]));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error(
                            "Cannot create including regular-expression for "
                                    + pattern, rese);
                }
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        this.excludeCrawlingURL.add(new RE(params[index]));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error(
                            "Cannot create excluding regular-expression for "
                                    + pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }
    }

    public void setup(SourceResolver resolver, Map objectModel, String src,
            Parameters par) throws ProcessingException, SAXException,
            IOException {

        Request request = ObjectModelHelper.getRequest(objectModel);
        Session session;
        try {
            session = RepositoryUtil.getSession(this.manager, request);
        } catch (RepositoryException e) {
            throw new ProcessingException(e);
        }
        this.identityMap = DocumentUtil.createDocumentFactory(this.manager,
                session);

        super.setup(resolver, objectModel, src, par);
        this.src = src;
        this.depth = par.getParameterAsInteger("depth", 1);
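        // Note: with the default depth of 1, the links found on the source
        // page are status-checked, but their own links are not followed.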

        /* Create reusable attributes for the generated nodes */
        this.attributes = new AttributesImpl();
    }

    /**
     * Generate XML data.
     *
     * @throws SAXException
     *             if an error occurs while outputting the document
     * @throws ProcessingException
     *             if the requested URI wasn't found
     */
    public void generate() throws SAXException, ProcessingException {

        crawled = new HashSet();
        linksToProcess = new HashSet();

        // this first node should be handled as a cocoon source
        String root = this.src;
        linksToProcess.add(new Link(root, "", 0));

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + root);
        }

        this.contentHandler.startDocument();
        this.contentHandler.startPrefixMapping(PREFIX, URI);

        attributes.clear();
        super.contentHandler.startElement(URI, TOP_NODE_NAME,
                PREFIX + ':' + TOP_NODE_NAME, attributes);

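        // Work through the to-do list: take one link at a time, report its
        // status, and (while the configured depth is not exceeded) add the
        // links found on that page to the list. The loop terminates because
        // every processed URI is recorded in 'crawled' and never re-added.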
        while (linksToProcess.size() > 0) {
            Iterator i = linksToProcess.iterator();

            if (i.hasNext()) {
                // fetch a URL
                Link link = (Link) i.next();
                String uri = link.getURI();
                int referrerDepth = link.getDepth();
                // remove it from the to-do list
                linksToProcess.remove(link);
                String new_url_link = processURL(uri, link.getReferrer(),
                        referrerDepth);

                // collect all links from this url
                if (new_url_link != null && referrerDepth < this.depth) {

                    List url_links = getLinksFromConnection(new_url_link, uri,
                            referrerDepth);
                    if (url_links != null) {
                        // add links of this url to the to-do list
                        linksToProcess.addAll(url_links);
                    }
                }
            }
        }

        super.contentHandler.endElement(URI, TOP_NODE_NAME,
                PREFIX + ':' + TOP_NODE_NAME);
        this.contentHandler.endPrefixMapping(PREFIX);
        this.contentHandler.endDocument();
    }

    /**
     * Default exclude patterns.
     * <p>
     * By default, URLs matching the following patterns are excluded:
     * </p>
     * <ul>
     * <li>.*\\.gif(\\?.*)?$ - GIF images</li>
     * <li>.*\\.png(\\?.*)?$ - PNG images</li>
     * <li>.*\\.jpe?g(\\?.*)?$ - JPEG images</li>
     * <li>.*\\.js(\\?.*)?$ - JavaScript</li>
     * <li>.*\\.css(\\?.*)?$ - cascading stylesheets</li>
     * <li>.*\\?.* - URLs containing a query string</li>
     * <li>.*\\@.* - URLs containing a '@', e.g. mail addresses</li>
     * </ul>
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = { ".*\\.gif(\\?.*)?$",
                ".*\\.png(\\?.*)?$", ".*\\.jpe?g(\\?.*)?$",
                ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$", ".*\\?.*",
                ".*\\@.*" };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error(
                        "Cannot create excluding regular-expression for "
                                + pattern, rese);
            }
        }
    }

    /**
     * Retrieve the list of links of a URL.
     *
     * @param url_link_string URL for requesting links; it is assumed that
     *            url_link_string queries the cocoon view "links", i.e. is of
     *            the form <code>http://host/foo/bar?cocoon-view=links</code>
     * @param url_of_referrer base URL of which the links are requested, i.e.
     *            of the form <code>http://host/foo/bar</code>
     * @param referrerDepth the crawling depth of the referrer
     * @return List of links from url_of_referrer, as the result of requesting
     *         the URL url_link_string
     */
    protected List getLinksFromConnection(String url_link_string,
            String url_of_referrer, int referrerDepth) {
        List url_links = null;
        BufferedReader br = null;
        try {

            url_links = new ArrayList();
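            // Resolve the link view through the internal "cocoon:" protocol so
            // that the request runs through the sitemap pipeline; the links
            // view is expected to return one URI per line.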
            url_link_string = "cocoon:/" + url_link_string;

            inputSource = super.resolver.resolveURI(url_link_string);
            InputStream is = inputSource.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            // content is supposed to be a list of links,
            // relative to the current URL
            String line;
            String referrer = url_of_referrer;

            while ((line = br.readLine()) != null) {
                String new_url = line;

                // don't add new_url twice
                boolean add_url = !url_links.contains(new_url);

                // don't add new_url if it has been crawled already
                if (add_url) {
                    add_url &= !crawled.contains(new_url);
                }

                Link new_link = new Link(line, referrer, referrerDepth + 1);
                if (add_url) {
                    add_url &= !linksToProcess.contains(new_link);
                }

                // don't add if not matched by an existing include definition
                if (add_url) {
                    add_url &= isIncludedURL(new_url);
                }

                // don't add if matched by an existing exclude definition
                if (add_url) {
                    add_url &= !(isExcludedURL(new_url));
                }

                if (add_url) {
                    if (getLogger().isDebugEnabled()) {
                        getLogger().debug("Add URL: " + new_url);
                    }
                    url_links.add(new_link);
                }
            }
            // now we have a list of URLs which should be examined

        } catch (IOException ioe) {
            getLogger().warn("Problems getting links of " + url_link_string,
                    ioe);
        } finally {
            // explicitly close the stream
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException ignored) {
                }
            }
        }
        return url_links;
    }

    /**
     * Generate the XML attributes of a URL and calculate the URL for
     * retrieving its links.
     *
     * @param uri the URL to process
     * @param referrer the referrer of the URL
     * @param referrerDepth the crawling depth of the referrer
     * @return String URL for retrieving the links of this URL, or null if no
     *         such URL could be determined (e.g. the URL has already been
     *         crawled, or it is not a publication document)
     * @throws SAXException if an error occurs while emitting the link element
     */
    protected String processURL(String uri, String referrer, int referrerDepth)
            throws SAXException {

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("getLinks URL " + uri);
        }

        // don't try to investigate a url which has been crawled already
        if (crawled.contains(uri)) {
            return null;
        }

        // TODO: need to respect robots.txt

        // mark it as crawled
        crawled.add(uri);

        attributes.clear();
        attributes.addAttribute("", HREF_ATTR_NAME, HREF_ATTR_NAME, "CDATA",
                uri);
        attributes.addAttribute("", REFERRER_ATTR_NAME, REFERRER_ATTR_NAME,
                "CDATA", referrer);

        // Output url, referrer, content-type, status, message for traversable URLs
        HttpURLConnection h = null;
        URL url = null;
        String newURL = null;
        try {
            String content_type = "text/html";
            String responseMessage = "not found";
            int responseCode = 404;
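            // The defaults above assume a broken link ("404 not found"); they
            // are overwritten below if the link can be resolved.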
            if (uri.startsWith("http://")) {
                url = new URL(uri);
                URLConnection links_url_connection = url.openConnection();
                h = (HttpURLConnection) links_url_connection;
                h.setRequestMethod("HEAD"); // let's be kind to external sites
                content_type = links_url_connection.getContentType();
                responseMessage = h.getResponseMessage();
                responseCode = h.getResponseCode();
            } else {
                String tempURI = uri;
                if (!(uri.startsWith("/"))) {
                    // resolve the link relative to its referrer
                    String contextURI = referrer.substring(0,
                            referrer.lastIndexOf("/") + 1);
                    tempURI = contextURI + uri;
                }

                // see if the document exists
                if (this.identityMap.isDocument(tempURI)) {
                    content_type = "text/html";
                    responseMessage = "ok";
                    responseCode = 200;
                    newURL = tempURI;
                } else {
                    // see if the resource exists
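                    // (not implemented; the "not found" defaults set above
                    // are reported for such resources)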
                }
            }

            attributes.addAttribute("", CONTENT_ATTR_NAME, CONTENT_ATTR_NAME,
                    "CDATA", content_type);

            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME,
                    "CDATA", responseMessage);

            attributes.addAttribute("", STATUS_ATTR_NAME, STATUS_ATTR_NAME,
                    "CDATA", String.valueOf(responseCode));
        } catch (IOException ioe) {
            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME,
                    "CDATA", ioe.getMessage());
        } catch (final Exception e1) {
            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME,
                    "CDATA", e1.getMessage());
        } finally {
            if (h != null) {
                h.disconnect();
            }
        }

        // don't try to get links of a url which is excluded from crawling;
        // only try to get links of a url which is included for crawling
        if (!isExcludedURL(uri) && isIncludedURL(uri)) {
            // add prefix and query to get data from the linkserializer
            if (newURL != null) {
                if (newURL.indexOf("?") > -1) {
                    // replace an existing query string with the link view query
                    newURL = newURL.substring(0, newURL.indexOf("?")) + "?"
                            + linkViewQuery;
                } else {
                    newURL = newURL + "?" + linkViewQuery;
                }
            }
        }

        // the link rewriter transformer takes care of internal links
        if (uri.startsWith("http://")) {
            super.contentHandler.startElement(URI, LINK_NODE_NAME,
                    PREFIX + ':' + LINK_NODE_NAME, attributes);
            super.contentHandler.endElement(URI, LINK_NODE_NAME,
                    PREFIX + ':' + LINK_NODE_NAME);
        }

        return newURL;
    }

    /**
     * Check whether a URL matches one of the exclude patterns.
     *
     * @param url the URL to check
     * @return true if the URL is excluded from crawling
     */
    private boolean isExcludedURL(String url) {
        // by default, include the URL for crawling
        if (excludeCrawlingURL == null) {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("no exclude patterns, not excluding URL "
                        + url);
            }
            return false;
        }

        final String s = url;
        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("exclude URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("not excluding URL " + url);
        }
        return false;
    }

    /**
     * Check whether a URL matches one of the include patterns.
     *
     * @param url the URL to check
     * @return true if the URL is included for crawling
     */
    private boolean isIncludedURL(String url) {
        // by default, include the URL for crawling
        if (includeCrawlingURL == null) {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("no include patterns, including URL " + url);
            }
            return true;
        }

        final String s = url;
        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("include URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("not including URL " + url);
        }
        return false;
    }

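    /**
     * Release the pooled resources so that this generator instance can be
     * reused (see {@link Recyclable}).
     */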
    public void recycle() {
        if (null != this.inputSource) {
            super.resolver.release(this.inputSource);
            this.inputSource = null;
        }
        this.manager.release(super.resolver);
        super.resolver = null;
        this.manager = null;
        this.attributes = null;
        super.recycle();
    }
}
|