/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lenya.cms.cocoon.generation;

import org.apache.cocoon.generation.ServiceableGenerator;
import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.environment.ObjectModelHelper;
import org.apache.cocoon.environment.Request;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.excalibur.source.Source;
import org.apache.lenya.cms.publication.DocumentFactory;
import org.apache.lenya.cms.publication.DocumentUtil;
import org.apache.lenya.cms.repository.RepositoryException;
import org.apache.lenya.cms.repository.RepositoryUtil;
import org.apache.lenya.cms.repository.Session;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.io.IOException;
import java.io.InputStream;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URLConnection;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Map;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
/**
 * Generates a list of the links that are reachable from the src, together
 * with their status.
 *
 * <pre>
 * &lt;map:generator name="linkStatus" src="org.apache.lenya.cms.cocoon.generation.LinkStatusGenerator"/&gt;
 *
 * &lt;map:generate type="linkStatus" src="/{pubid}/{area}/{doc-id}.html"&gt;
 *   &lt;map:parameter name="depth" value="1"/&gt;
 * &lt;/map:generate&gt;
 * </pre>
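 *
 * <p>
 * For each checked external link, a <code>link</code> element is emitted with
 * <code>href</code>, <code>referrer</code>, <code>content</code>,
 * <code>status</code> and <code>message</code> attributes. An illustrative
 * example (the attribute values depend on the crawled site):
 * </p>
 * <pre>
 * &lt;linkstatus:link href="http://example.org/" referrer="/pub/live/index.html"
 *     content="text/html" status="200" message="OK"/&gt;
 * </pre>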
 */
public class LinkStatusGenerator extends ServiceableGenerator implements
        Recyclable, Configurable {

    /** The URI of the namespace of this generator. */
    protected static final String URI = "http://apache.org/cocoon/linkstatus/2.0";

    /** The namespace prefix for this namespace. */
    protected static final String PREFIX = "linkstatus";

    /* Node and attribute names */
    protected static final String TOP_NODE_NAME = "linkstatus";
    protected static final String LINK_NODE_NAME = "link";

    protected static final String HREF_ATTR_NAME = "href";
    protected static final String REFERRER_ATTR_NAME = "referrer";
    protected static final String CONTENT_ATTR_NAME = "content";
    protected static final String STATUS_ATTR_NAME = "status";
    protected static final String MESSAGE_ATTR_NAME = "message";

    protected AttributesImpl attributes;

    /**
     * Config element name specifying the expected link content-type.
     * <p>
     * Its value is <code>link-content-type</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of the <code>link-content-type</code> configuration value.
     * <p>
     * Its value is <code>application/x-cocoon-links</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";

    /**
     * Config element name specifying the query-string appended when requesting
     * the links of a URL.
     * <p>
     * Its value is <code>link-view-query</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of the <code>link-view-query</code> configuration value.
     * <p>
     * Its value is <code>cocoon-view=links</code> (the <code>?</code>
     * separator is added when the query is appended to a URL).
     * </p>
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
    /**
     * Config element name specifying an excluding regular expression pattern.
     * <p>
     * Its value is <code>exclude</code>.
     * </p>
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying an including regular expression pattern.
     * <p>
     * Its value is <code>include</code>.
     * </p>
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP <code>User-Agent</code> header value.
     * <p>
     * Its value is <code>user-agent</code>.
     * </p>
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of the <code>user-agent</code> configuration value.
     *
     * @see org.apache.cocoon.Constants#COMPLETE_NAME
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;

    /**
     * Config element name specifying the HTTP <code>Accept</code> header value.
     * <p>
     * Its value is <code>accept</code>.
     * </p>
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of the <code>accept</code> configuration value.
     * <p>
     * Its value is <code>*&#47;*</code>.
     * </p>
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;

    private HashSet crawled;
    private HashSet linksToProcess;

    /** The depth parameter determines how many levels of links this generator follows. */
    protected int depth = 1;

    protected Source inputSource;
    String src;
    private DocumentFactory identityMap;
    /**
     * Stores a link to process together with its referrer and crawling depth.
     */
    private static class Link {
        private String uri;
        private String referrer;
        private int linkDepth;

        public Link(String uri, String referrer, int linkDepth) {
            this.uri = uri;
            this.referrer = referrer;
            this.linkDepth = linkDepth;
        }

        public String getURI() {
            return uri;
        }

        public String getReferrer() {
            return referrer;
        }

        public int getDepth() {
            return linkDepth;
        }

        /**
         * Links are equal if their URIs are equal; referrer and depth are
         * ignored. equals(Object) and hashCode() are overridden so that
         * HashSet.contains() works on Link instances.
         */
        public boolean equals(Object o) {
            return (o instanceof Link) && uri.equals(((Link) o).getURI());
        }

        public int hashCode() {
            return uri.hashCode();
        }
    }

    /**
     * Configure the crawler component.
     * <p>
     * The configuration can specify which URIs to include in and which URIs to
     * exclude from crawling. The patterns are given as regular expressions.
     * </p>
     * <p>
     * Moreover, you can configure the required content-type of the crawling
     * request and the query-string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; cocoon-view=links &lt;/link-view-query&gt;
     * &lt;user-agent&gt; Cocoon &lt;/user-agent&gt;
     * &lt;accept&gt; text/xml &lt;/accept&gt;
     * </tt></pre>
     *
     * @param configuration XML configuration of this avalon component.
     * @exception ConfigurationException thrown if the configuration is invalid.
     */
    public void configure(Configuration configuration)
            throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    // a pattern element may hold a comma-separated list of patterns
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        this.includeCrawlingURL.add(new RE(params[index]));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error(
                            "Cannot create including regular-expression for "
                                    + pattern, rese);
                }
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        this.excludeCrawlingURL.add(new RE(params[index]));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error(
                            "Cannot create excluding regular-expression for "
                                    + pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }
    }

    public void setup(SourceResolver resolver, Map objectModel, String src,
            Parameters par) throws ProcessingException, SAXException,
            IOException {

        Request request = ObjectModelHelper.getRequest(objectModel);
        Session session;
        try {
            session = RepositoryUtil.getSession(this.manager, request);
        } catch (RepositoryException e) {
            throw new ProcessingException(e);
        }
        this.identityMap = DocumentUtil.createDocumentFactory(this.manager,
                session);

        super.setup(resolver, objectModel, src, par);
        this.src = src;
        this.depth = par.getParameterAsInteger("depth", 1);
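        // Note: with the default depth of 1, the links found on the source
        // page are status-checked, but their own links are not followed.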

        /* Create reusable attributes for the generated nodes */
        this.attributes = new AttributesImpl();
    }

    /**
     * Generate XML data.
     *
     * @throws SAXException
     *             if an error occurs while outputting the document
     * @throws ProcessingException
     *             if the requested URI wasn't found
     */
    public void generate() throws SAXException, ProcessingException {

        crawled = new HashSet();
        linksToProcess = new HashSet();

        // this first node should be handled as a cocoon source
        String root = this.src;
        linksToProcess.add(new Link(root, "", 0));

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + root);
        }

        this.contentHandler.startDocument();
        this.contentHandler.startPrefixMapping(PREFIX, URI);

        attributes.clear();
        super.contentHandler.startElement(URI, TOP_NODE_NAME,
                PREFIX + ':' + TOP_NODE_NAME, attributes);

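        // Work through the to-do list: take one link at a time, report its
        // status, and (while the configured depth is not exceeded) add the
        // links found on that page to the list. The loop terminates because
        // every processed URI is recorded in 'crawled' and never re-added.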
        while (linksToProcess.size() > 0) {
            Iterator i = linksToProcess.iterator();

            if (i.hasNext()) {
                // fetch a URL
                Link link = (Link) i.next();
                String uri = link.getURI();
                int referrerDepth = link.getDepth();
                // remove it from the to-do list
                linksToProcess.remove(link);
                String new_url_link = processURL(uri, link.getReferrer(),
                        referrerDepth);

                // collect all links from this url
                if (new_url_link != null && referrerDepth < this.depth) {

                    List url_links = getLinksFromConnection(new_url_link, uri,
                            referrerDepth);
                    if (url_links != null) {
                        // add links of this url to the to-do list
                        linksToProcess.addAll(url_links);
                    }
                }
            }
        }

        super.contentHandler.endElement(URI, TOP_NODE_NAME,
                PREFIX + ':' + TOP_NODE_NAME);
        this.contentHandler.endPrefixMapping(PREFIX);
        this.contentHandler.endDocument();
    }

    /**
     * Default exclude patterns.
     * <p>
     * By default, URLs matching the following patterns are excluded:
     * </p>
     * <ul>
     * <li>.*\\.gif(\\?.*)?$ - GIF images</li>
     * <li>.*\\.png(\\?.*)?$ - PNG images</li>
     * <li>.*\\.jpe?g(\\?.*)?$ - JPEG images</li>
     * <li>.*\\.js(\\?.*)?$ - JavaScript</li>
     * <li>.*\\.css(\\?.*)?$ - cascading stylesheets</li>
     * <li>.*\\?.* - URLs containing a query string</li>
     * <li>.*\\@.* - URLs containing a '@', e.g. mail addresses</li>
     * </ul>
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = { ".*\\.gif(\\?.*)?$",
                ".*\\.png(\\?.*)?$", ".*\\.jpe?g(\\?.*)?$",
                ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$", ".*\\?.*",
                ".*\\@.*" };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error(
                        "Cannot create excluding regular-expression for "
                                + pattern, rese);
            }
        }
    }

    /**
     * Retrieve the list of links of a URL.
     *
     * @param url_link_string URL for requesting links; it is assumed that
     *            url_link_string queries the cocoon view "links", i.e. is of
     *            the form <code>http://host/foo/bar?cocoon-view=links</code>
     * @param url_of_referrer base URL of which the links are requested, i.e.
     *            of the form <code>http://host/foo/bar</code>
     * @param referrerDepth the crawling depth of the referrer
     * @return List of links from url_of_referrer, as the result of requesting
     *         the URL url_link_string
     */
    protected List getLinksFromConnection(String url_link_string,
            String url_of_referrer, int referrerDepth) {
        List url_links = null;
        BufferedReader br = null;
        try {

            url_links = new ArrayList();
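            // Resolve the link view through the internal "cocoon:" protocol so
            // that the request runs through the sitemap pipeline; the links
            // view is expected to return one URI per line.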
            url_link_string = "cocoon:/" + url_link_string;

            inputSource = super.resolver.resolveURI(url_link_string);
            InputStream is = inputSource.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            // content is supposed to be a list of links,
            // relative to the current URL
            String line;
            String referrer = url_of_referrer;

            while ((line = br.readLine()) != null) {
                String new_url = line;

                // don't add new_url twice
                boolean add_url = !url_links.contains(new_url);

                // don't add new_url if it has been crawled already
                if (add_url) {
                    add_url &= !crawled.contains(new_url);
                }

                Link new_link = new Link(line, referrer, referrerDepth + 1);
                if (add_url) {
                    add_url &= !linksToProcess.contains(new_link);
                }

                // don't add if not matched by an existing include definition
                if (add_url) {
                    add_url &= isIncludedURL(new_url);
                }

                // don't add if matched by an existing exclude definition
                if (add_url) {
                    add_url &= !(isExcludedURL(new_url));
                }

                if (add_url) {
                    if (getLogger().isDebugEnabled()) {
                        getLogger().debug("Add URL: " + new_url);
                    }
                    url_links.add(new_link);
                }
            }
            // now we have a list of URLs which should be examined

        } catch (IOException ioe) {
            getLogger().warn("Problems getting links of " + url_link_string,
                    ioe);
        } finally {
            // explicitly close the stream
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException ignored) {
                }
            }
        }
        return url_links;
    }

    /**
     * Generate the XML attributes of a URL and calculate the URL for
     * retrieving its links.
     *
     * @param uri the URL to process
     * @param referrer the referrer of the URL
     * @param referrerDepth the crawling depth of the referrer
     * @return String URL for retrieving the links of this URL, or null if no
     *         such URL could be determined (e.g. the URL has already been
     *         crawled, or it is not a publication document)
     * @throws SAXException if an error occurs while emitting the link element
     */
    protected String processURL(String uri, String referrer, int referrerDepth)
            throws SAXException {

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("getLinks URL " + uri);
        }

        // don't try to investigate a url which has been crawled already
        if (crawled.contains(uri)) {
            return null;
        }

        // TODO: need to respect robots.txt

        // mark it as crawled
        crawled.add(uri);

        attributes.clear();
        attributes.addAttribute("", HREF_ATTR_NAME, HREF_ATTR_NAME, "CDATA",
                uri);
        attributes.addAttribute("", REFERRER_ATTR_NAME, REFERRER_ATTR_NAME,
                "CDATA", referrer);

        // Output url, referrer, content-type, status, message for traversable URLs
        HttpURLConnection h = null;
        URL url = null;
        String newURL = null;
        try {
            String content_type = "text/html";
            String responseMessage = "not found";
            int responseCode = 404;
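            // The defaults above assume a broken link ("404 not found"); they
            // are overwritten below if the link can be resolved.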
            if (uri.startsWith("http://")) {
                url = new URL(uri);
                URLConnection links_url_connection = url.openConnection();
                h = (HttpURLConnection) links_url_connection;
                h.setRequestMethod("HEAD"); // let's be kind to external sites
                content_type = links_url_connection.getContentType();
                responseMessage = h.getResponseMessage();
                responseCode = h.getResponseCode();
            } else {
                String tempURI = uri;
                if (!(uri.startsWith("/"))) {
                    // resolve the link relative to its referrer
                    String contextURI = referrer.substring(0,
                            referrer.lastIndexOf("/") + 1);
                    tempURI = contextURI + uri;
                }

                // see if the document exists
                if (this.identityMap.isDocument(tempURI)) {
                    content_type = "text/html";
                    responseMessage = "ok";
                    responseCode = 200;
                    newURL = tempURI;
                } else {
                    // see if the resource exists
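                    // (not implemented; the "not found" defaults set above
                    // are reported for such resources)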
                }
            }

            attributes.addAttribute("", CONTENT_ATTR_NAME, CONTENT_ATTR_NAME,
                    "CDATA", content_type);

            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME,
                    "CDATA", responseMessage);

            attributes.addAttribute("", STATUS_ATTR_NAME, STATUS_ATTR_NAME,
                    "CDATA", String.valueOf(responseCode));
        } catch (IOException ioe) {
            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME,
                    "CDATA", ioe.getMessage());
        } catch (final Exception e1) {
            attributes.addAttribute("", MESSAGE_ATTR_NAME, MESSAGE_ATTR_NAME,
                    "CDATA", e1.getMessage());
        } finally {
            if (h != null) {
                h.disconnect();
            }
        }

        // don't try to get links of a url which is excluded from crawling;
        // only try to get links of a url which is included for crawling
        if (!isExcludedURL(uri) && isIncludedURL(uri)) {
            // add prefix and query to get data from the linkserializer
            if (newURL != null) {
                if (newURL.indexOf("?") > -1) {
                    // replace an existing query string with the link view query
                    newURL = newURL.substring(0, newURL.indexOf("?")) + "?"
                            + linkViewQuery;
                } else {
                    newURL = newURL + "?" + linkViewQuery;
                }
            }
        }

        // the link rewriter transformer takes care of internal links
        if (uri.startsWith("http://")) {
            super.contentHandler.startElement(URI, LINK_NODE_NAME,
                    PREFIX + ':' + LINK_NODE_NAME, attributes);
            super.contentHandler.endElement(URI, LINK_NODE_NAME,
                    PREFIX + ':' + LINK_NODE_NAME);
        }

        return newURL;
    }

    /**
     * Check whether a URL matches one of the exclude patterns.
     *
     * @param url the URL to check
     * @return true if the URL is excluded from crawling
     */
    private boolean isExcludedURL(String url) {
        // by default, include the URL for crawling
        if (excludeCrawlingURL == null) {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("no exclude patterns, not excluding URL "
                        + url);
            }
            return false;
        }

        final String s = url;
        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("exclude URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("not excluding URL " + url);
        }
        return false;
    }

    /**
     * Check whether a URL matches one of the include patterns.
     *
     * @param url the URL to check
     * @return true if the URL is included for crawling
     */
    private boolean isIncludedURL(String url) {
        // by default, include the URL for crawling
        if (includeCrawlingURL == null) {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("no include patterns, including URL " + url);
            }
            return true;
        }

        final String s = url;
        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(s)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("include URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("not including URL " + url);
        }
        return false;
    }

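    /**
     * Release the pooled resources so that this generator instance can be
     * reused (see {@link Recyclable}).
     */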
    public void recycle() {
        if (null != this.inputSource) {
            super.resolver.release(this.inputSource);
            this.inputSource = null;
        }
        this.manager.release(super.resolver);
        super.resolver = null;
        this.manager = null;
        this.attributes = null;
        super.recycle();
    }
}
|