Source Code Cross Referenced for SimpleCocoonCrawlerImpl.java (Content Management System » apache-lenya-2.0 » org.apache.cocoon.components.crawler)



/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.components.crawler;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.commons.lang.StringUtils;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

/**
 * A simple cocoon crawler.
 *
 * @author     <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
 * @version CVS $Id: SimpleCocoonCrawlerImpl.java 433543 2006-08-22 06:22:54Z crossley $
 */
public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled
        implements CocoonCrawler, Configurable, Disposable, Recyclable {

    /**
     * Config element name specifying the expected link content-type.
     * <p>
     *   Its value is <code>link-content-type</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";

    /**
     * Default value of the <code>link-content-type</code> configuration value.
     * <p>
     *   Its value is <code>application/x-cocoon-links</code>.
     * </p>
     */
    public final static String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;

    /**
     * Config element name specifying the query string appended when requesting
     * the links of a URL.
     * <p>
     *  Its value is <code>link-view-query</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";

    /**
     * Default value of the <code>link-view-query</code> configuration option.
     * <p>
     *   Its value is <code>cocoon-view=links</code>.
     * </p>
     */
    public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";

    /**
     * Config element name specifying an excluding regular expression pattern.
     * <p>
     *  Its value is <code>exclude</code>.
     * </p>
     */
    public final static String EXCLUDE_CONFIG = "exclude";

    /**
     * Config element name specifying an including regular expression pattern.
     * <p>
     *  Its value is <code>include</code>.
     * </p>
     */
    public final static String INCLUDE_CONFIG = "include";

    /**
     * Config element name specifying the HTTP User-Agent header value.
     * <p>
     *  Its value is <code>user-agent</code>.
     * </p>
     */
    public final static String USER_AGENT_CONFIG = "user-agent";

    /**
     * Default value of the <code>user-agent</code> configuration option.
     * @see Constants#COMPLETE_NAME
     */
    public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;

    /**
     * Config element name specifying the HTTP Accept header value.
     * <p>
     *  Its value is <code>accept</code>.
     * </p>
     */
    public final static String ACCEPT_CONFIG = "accept";

    /**
     * Default value of the <code>accept</code> configuration option.
     * <p>
     *   Its value is <code>* / *</code> (the spaces appear here only to keep
     *   this comment valid).
     * </p>
     */
    public final static String ACCEPT_DEFAULT = "*/*";

    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
    private HashSet excludeCrawlingURL;
    private HashSet includeCrawlingURL;
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;
    private HashSet crawled;

    protected int depth;
    protected HashSet urlsToProcess;
    protected HashSet urlsNextDepth;

    /**
     * Constructor for the SimpleCocoonCrawlerImpl object
     */
    public SimpleCocoonCrawlerImpl() {
        // null means: include everything
        includeCrawlingURL = null;
        // null means: exclude nothing; configure() installs the default
        // exclude patterns (common image formats, javascript, css)
        excludeCrawlingURL = null;
    }

    /**
     * Configure the crawler component.
     * <p>
     *  The configuration can specify which URIs to include in, and which URIs
     *  to exclude from, crawling. The patterns are given as regular expressions.
     * </p>
     * <p>
     *  Moreover you can configure
     *  the required content-type of the crawling request, and the
     *  query string appended to each crawling request.
     * </p>
     * <pre><tt>
     * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
     * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
     * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
     * &lt;link-view-query&gt; cocoon-view=links &lt;/link-view-query&gt;
     * </tt></pre>
     *
     * @param  configuration               XML configuration of this Avalon component.
     * @exception  ConfigurationException  thrown if the configuration is invalid.
     */
    public void configure(Configuration configuration)
            throws ConfigurationException {

        Configuration[] children;
        children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            includeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenizedPattern = params[index];
                        this.includeCrawlingURL.add(new RE(tokenizedPattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error(
                            "Cannot create including regular-expression for "
                                    + pattern, rese);
                }
            }
        } else {
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Include all URLs");
            }
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        if (children.length > 0) {
            excludeCrawlingURL = new HashSet();
            for (int i = 0; i < children.length; i++) {
                String pattern = children[i].getValue();
                try {
                    String[] params = StringUtils.split(pattern, ", ");
                    for (int index = 0; index < params.length; index++) {
                        String tokenizedPattern = params[index];
                        this.excludeCrawlingURL.add(new RE(tokenizedPattern));
                    }
                } catch (RESyntaxException rese) {
                    getLogger().error(
                            "Cannot create excluding regular-expression for "
                                    + pattern, rese);
                }
            }
        } else {
            excludeCrawlingURL = new HashSet();
            setDefaultExcludeFromCrawling();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Exclude default URLs only");
            }
        }

        Configuration child;
        String value;
        child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkContentType = value.trim();
            }
        }
        child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.linkViewQuery = value.trim();
            }
        }

        child = configuration.getChild(USER_AGENT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.userAgent = value;
            }
        }

        child = configuration.getChild(ACCEPT_CONFIG, false);
        if (child != null) {
            value = child.getValue();
            if (value != null && value.length() > 0) {
                this.accept = value;
            }
        }
    }

    /**
     * Dispose at end of life cycle, releasing all resources.
     */
    public void dispose() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        excludeCrawlingURL = null;
        includeCrawlingURL = null;
    }

    /**
     * Recycle this object, releasing resources.
     */
    public void recycle() {
        crawled = null;
        urlsToProcess = null;
        urlsNextDepth = null;
        depth = -1;
    }

    /**
     * The same as calling crawl(url, -1);
     *
     * @param  url  Crawl this URL, getting all links from this URL.
     */
    public void crawl(URL url) {
        crawl(url, -1);
    }

    /**
     * Start crawling a URL.
     *
     * <p>
     *   Use this method to start crawling.
     *   Get this URL, and all its children, by using <code>iterator()</code>.
     *   The Iterator object will return URL objects.
     * </p>
     * <p>
     *  You may use the crawl() and iterator() methods in the following way:
     * </p>
     * <pre><tt>
     *   SimpleCocoonCrawlerImpl scci = ....;
     *   scci.crawl( new URL( "http://foo/bar" ) );
     *   Iterator i = scci.iterator();
     *   while (i.hasNext()) {
     *     URL url = (URL)i.next();
     *     ...
     *   }
     * </tt></pre>
     * <p>
     *   The i.next() method returns a URL, calculating the links of the
     *   URL before returning it.
     * </p>
     *
     * @param  url  Crawl this URL, getting all links from this URL.
     * @param  maxDepth  maximum depth to crawl to. -1 for no maximum.
     */
    public void crawl(URL url, int maxDepth) {
        crawled = new HashSet();
        urlsToProcess = new HashSet();
        urlsNextDepth = new HashSet();
        depth = maxDepth;

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + url + " to depth " + maxDepth);
        }

        urlsToProcess.add(url);
    }

    /**
     * Return an iterator, iterating over all links of the currently crawled URL.
     * <p>
     *   The Iterator object will return URL objects from its <code>next()</code>
     *   method.
     * </p>
     *
     * @return    Iterator over all links from the crawled URL.
     */
    public Iterator iterator() {
        return new CocoonCrawlerIterator(this);
    }

    /**
     * Install the default exclude patterns.
     * <p>
     *   By default URLs matching the following patterns are excluded:
     * </p>
     * <ul>
     *   <li>.*\\.gif(\\?.*)?$ - exclude GIF images</li>
     *   <li>.*\\.png(\\?.*)?$ - exclude PNG images</li>
     *   <li>.*\\.jpe?g(\\?.*)?$ - exclude JPEG images</li>
     *   <li>.*\\.js(\\?.*)?$ - exclude JavaScript</li>
     *   <li>.*\\.css(\\?.*)?$ - exclude cascading stylesheets</li>
     * </ul>
     */
    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = { ".*\\.gif(\\?.*)?$",
                ".*\\.png(\\?.*)?$", ".*\\.jpe?g(\\?.*)?$",
                ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$" };

        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException rese) {
                getLogger().error(
                        "Cannot create excluding regular-expression for "
                                + pattern, rese);
            }
        }
    }

    /**
     * Compute the list of links from a URL.
     * <p>
     *   Checks the include and exclude patterns, the content-type, and whether
     *   the url has been crawled already.
     * </p>
     *
     * @param  url  Crawl this URL
     * @return      list of URLs which are links from url, satisfying the conditions.
     */
    private List getLinks(URL url) {
        ArrayList urlLinks = null;
        String sURL = url.toString();

        if (!isIncludedURL(sURL) || isExcludedURL(sURL)) {
            return null;
        }

        // don't try to get links for a url which has been crawled already
        if (crawled.contains(sURL)) {
            return null;
        }

        // mark it as crawled
        crawled.add(sURL);

        // get links of url
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Getting links of URL " + sURL);
        }
        BufferedReader br = null;
        try {
            // getFile() returns the path plus any existing query string, so
            // the link-view query is appended with '?' or '&' as appropriate
            sURL = url.getFile();
            URL links = new URL(url, sURL
                    + ((sURL.indexOf("?") == -1) ? "?" : "&")
                    + linkViewQuery);
            URLConnection linksConnection = links.openConnection();
            linksConnection.setRequestProperty("Accept", accept);
            linksConnection.setRequestProperty("User-Agent", userAgent);
            linksConnection.connect();
            InputStream is = linksConnection.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));

            String contentType = linksConnection.getContentType();
            if (contentType == null) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug(
                            "Ignoring " + sURL + " (no content type)");
                }
                // there is a check on null in the calling method
                return null;
            }

            // strip any content-type parameters, e.g. "; charset=..."
            int index = contentType.indexOf(';');
            if (index != -1) {
                contentType = contentType.substring(0, index);
            }

            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Content-type: " + contentType);
            }

            if (contentType.equals(linkContentType)) {
                urlLinks = new ArrayList();

                // content is supposed to be a list of links,
                // relative to the current URL
                String line;
                while ((line = br.readLine()) != null) {
                    final URL newUrl = new URL(url, line);
                    final String sNewUrl = newUrl.toString();

                    // add the url only if it is not a duplicate, has not
                    // been crawled already, is matched by an include
                    // pattern (if any), and is not matched by an exclude
                    // pattern
                    boolean addUrl = !urlLinks.contains(newUrl)
                            && !crawled.contains(sNewUrl)
                            && isIncludedURL(sNewUrl)
                            && !isExcludedURL(sNewUrl);

                    if (addUrl) {
                        if (getLogger().isDebugEnabled()) {
                            getLogger().debug("Add URL: " + sNewUrl);
                        }
                        urlLinks.add(newUrl);
                    }
                }
                // now we have a list of URLs which should be examined
            }
        } catch (IOException ioe) {
            getLogger().warn("Problems getting links of " + url, ioe);
        } finally {
            if (br != null) {
                try {
                    br.close();
                    br = null;
                } catch (IOException ignored) {
                }
            }
        }
        return urlLinks;
    }

    /**
     * Check if a URL matches an exclude pattern, i.e. is no candidate for crawling.
     *
     * @param  url  the URL to check
     * @return      true if the URL is excluded from crawling, else false
     */
    private boolean isExcludedURL(String url) {
        // by default do not exclude any URL from crawling
        if (excludeCrawlingURL == null) {
            return false;
        }

        Iterator i = excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(url)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Excluded URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not excluded URL " + url);
        }
        return false;
    }

    /**
     * Check if a URL matches an include pattern, i.e. is a candidate for crawling.
     *
     * @param  url  the URL to check
     * @return      true if the URL is included in crawling, else false
     */
    private boolean isIncludedURL(String url) {
        // by default include every URL for crawling
        if (includeCrawlingURL == null) {
            return true;
        }

        Iterator i = includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE) i.next();
            if (pattern.match(url)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Included URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not included URL " + url);
        }
        return false;
    }

    /**
     * Helper class implementing an Iterator.
     * <p>
     *   This Iterator implementation calculates the links of a URL
     *   before returning it in the next() method.
     * </p>
     *
     * @author     <a href="mailto:berni_huber@a1.net">Bernhard Huber</a>
     * @version    $Id: SimpleCocoonCrawlerImpl.java 433543 2006-08-22 06:22:54Z crossley $
     */
    public static class CocoonCrawlerIterator implements Iterator {
        private SimpleCocoonCrawlerImpl cocoonCrawler;

        /**
         * Constructor for the CocoonCrawlerIterator object
         *
         * @param  cocoonCrawler  the containing CocoonCrawler instance.
         */
        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
            this.cocoonCrawler = cocoonCrawler;
        }

        /**
         * Check if crawling is finished.
         *
         * @return    <code>true</code> if there are still URLs to process,
         * i.e. crawling has not finished yet, else <code>false</code>.
         */
        public boolean hasNext() {
            return cocoonCrawler.urlsToProcess.size() > 0
                    || cocoonCrawler.urlsNextDepth.size() > 0;
        }

        /**
         * @return    the next URL, or null if none of the remaining
         * queued URLs yields a link list
         */
        public Object next() {
            if (cocoonCrawler.urlsToProcess.size() == 0
                    && cocoonCrawler.urlsNextDepth.size() > 0) {
                // process queued urls belonging to the next depth level
                cocoonCrawler.urlsToProcess = cocoonCrawler.urlsNextDepth;
                cocoonCrawler.urlsNextDepth = new HashSet();
                // fix Bugzilla Bug 25270
                // only decrease if depth > 0, i.e. never decrease
                // when depth is already -1 (unlimited)
                if (cocoonCrawler.depth > 0) {
                    cocoonCrawler.depth--;
                }
            }
            URL theNextUrl = null;
            // fix Bugzilla Bug 25270
            // set theNextUrl only if getLinks() returns a non-null list
            for (Iterator i = cocoonCrawler.urlsToProcess.iterator();
                    i.hasNext() && theNextUrl == null;) {
                // fetch a URL
                URL url = (URL) i.next();

                // remove it from the to-do list
                i.remove();

                if (cocoonCrawler.depth == -1
                        || cocoonCrawler.depth > 0) {
                    // calculate all links from this url
                    List urlLinks = cocoonCrawler.getLinks(url);
                    if (urlLinks != null) {
                        // add links of this url to the to-do list
                        cocoonCrawler.urlsNextDepth.addAll(urlLinks);
                        theNextUrl = url;
                    }
                }
            }
            // finally return url
            return theNextUrl;
        }

        /**
         * remove is not implemented.
         */
        public void remove() {
            throw new UnsupportedOperationException(
                    "remove is not implemented");
        }
    }
}
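
For readers who want to drive this component outside an Avalon container, here is a minimal usage sketch. It is not part of the original source: ConsoleLogger and DefaultConfiguration are standard Avalon Framework classes used here to drive the lifecycle by hand, while the class name CrawlerUsageSketch, the start URL http://localhost:8888/ and the exclude pattern are illustrative assumptions. The crawl only yields links if the target is a Cocoon instance that answers the cocoon-view=links view with a link list.

// Minimal usage sketch (not part of the original source). Assumes
// avalon-framework on the classpath and a running Cocoon instance at the
// (hypothetical) start URL that answers the cocoon-view=links view.
import java.net.URL;
import java.util.Iterator;

import org.apache.avalon.framework.configuration.DefaultConfiguration;
import org.apache.avalon.framework.logger.ConsoleLogger;
import org.apache.cocoon.components.crawler.SimpleCocoonCrawlerImpl;

public class CrawlerUsageSketch {
    public static void main(String[] args) throws Exception {
        SimpleCocoonCrawlerImpl crawler = new SimpleCocoonCrawlerImpl();

        // Avalon lifecycle, driven by hand: logger first ...
        crawler.enableLogging(new ConsoleLogger(ConsoleLogger.LEVEL_INFO));

        // ... then configuration, equivalent to the XML
        // <crawler><exclude>.*\.pdf(\?.*)?$</exclude></crawler>
        DefaultConfiguration conf = new DefaultConfiguration("crawler", "-");
        DefaultConfiguration exclude = new DefaultConfiguration(
                SimpleCocoonCrawlerImpl.EXCLUDE_CONFIG, "-");
        exclude.setValue(".*\\.pdf(\\?.*)?$");
        conf.addChild(exclude);
        crawler.configure(conf);

        // crawl two levels deep; iterator() yields the reachable URLs
        crawler.crawl(new URL("http://localhost:8888/"), 2);
        for (Iterator i = crawler.iterator(); i.hasNext();) {
            URL url = (URL) i.next();
            // next() may return null when a queued URL yields no link list
            if (url != null) {
                System.out.println(url);
            }
        }

        // release resources at end of life cycle
        crawler.dispose();
    }
}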