001: /* Copyright (C) 2003 Internet Archive.
002: *
003: * This file is part of the Heritrix web crawler (crawler.archive.org).
004: *
005: * Heritrix is free software; you can redistribute it and/or modify
006: * it under the terms of the GNU Lesser Public License as published by
007: * the Free Software Foundation; either version 2.1 of the License, or
008: * any later version.
009: *
010: * Heritrix is distributed in the hope that it will be useful,
011: * but WITHOUT ANY WARRANTY; without even the implied warranty of
012: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013: * GNU Lesser Public License for more details.
014: *
015: * You should have received a copy of the GNU Lesser Public License
016: * along with Heritrix; if not, write to the Free Software
017: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
018: *
019: * CrawlScope.java
020: * Created on Oct 1, 2003
021: *
022: * $Header$
023: */
024: package org.archive.crawler.framework;
025:
026: import java.io.BufferedReader;
027: import java.io.File;
028: import java.io.FileReader;
029: import java.io.FileWriter;
030: import java.io.IOException;
031: import java.io.Writer;
032: import java.util.HashSet;
033: import java.util.Iterator;
034: import java.util.List;
035: import java.util.Set;
036: import java.util.logging.Logger;
037:
038: import javax.management.AttributeNotFoundException;
039: import javax.management.MBeanException;
040: import javax.management.ReflectionException;
041:
042: import org.apache.commons.httpclient.URIException;
043: import org.archive.crawler.datamodel.CandidateURI;
044: import org.archive.crawler.scope.SeedFileIterator;
045: import org.archive.crawler.scope.SeedListener;
046: import org.archive.crawler.settings.CrawlerSettings;
047: import org.archive.crawler.settings.SimpleType;
048: import org.archive.crawler.settings.Type;
049: import org.archive.net.UURI;
050: import org.archive.util.DevUtils;
051:
052: /**
053: * A CrawlScope instance defines which URIs are "in"
054: * a particular crawl.
055: *
056: * It is essentially a Filter which determines, looking at
057: * the totality of information available about a
 * CandidateURI/CrawlURI instance, if that URI should be
059: * scheduled for crawling.
060: *
061: * Dynamic information inherent in the discovery of the
062: * URI -- such as the path by which it was discovered --
063: * may be considered.
064: *
065: * Dynamic information which requires the consultation
066: * of external and potentially volatile information --
067: * such as current robots.txt requests and the history
068: * of attempts to crawl the same URI -- should NOT be
069: * considered. Those potentially high-latency decisions
070: * should be made at another step.
071: *
072: * @author gojomo
073: *
074: */
075: public class CrawlScope extends Filter {
076:
077: private static final long serialVersionUID = -3321533224526211277L;
078:
079: private static final Logger logger = Logger
080: .getLogger(CrawlScope.class.getName());
081: public static final String ATTR_NAME = "scope";
082: public static final String ATTR_SEEDS = "seedsfile";
083:
084: /**
085: * Whether every configu change should trigger a
086: * rereading of the original seeds spec/file.
087: */
088: public static final String ATTR_REREAD_SEEDS_ON_CONFIG = "reread-seeds-on-config";
089: public static final Boolean DEFAULT_REREAD_SEEDS_ON_CONFIG = Boolean.TRUE;
090:
091: protected Set<SeedListener> seedListeners = new HashSet<SeedListener>();
092:
093: /** Constructs a new CrawlScope.
094: *
095: * @param name the name is ignored since it always have to be the value of
096: * the constant ATT_NAME.
097: */
098: public CrawlScope(String name) {
099: // 'name' is never used.
100: super (ATTR_NAME, "Crawl scope");
101: Type t;
102: t = addElementToDefinition(new SimpleType(ATTR_SEEDS,
103: "File from which to extract seeds.", "seeds.txt"));
104: t.setOverrideable(false);
105: t.setExpertSetting(true);
106: t = addElementToDefinition(new SimpleType(
107: ATTR_REREAD_SEEDS_ON_CONFIG,
108: "Whether to reread the seeds specification, whether it has "
109: + "changed or not, every time any configuration change occurs. "
110: + "If true, seeds are reread even when (for example) new "
111: + "domain overrides are set. Rereading the seeds can take a "
112: + "long time with large seed lists.",
113: DEFAULT_REREAD_SEEDS_ON_CONFIG));
114: t.setOverrideable(false);
115: t.setExpertSetting(true);
116:
117: }
118:
119: /** Default constructor.
120: */
121: public CrawlScope() {
122: this (ATTR_NAME);
123: }
124:
125: /**
126: * Initialize is called just before the crawler starts to run.
127: *
128: * The settings system is up and initialized so can be used. This
129: * initialize happens after {@link #earlyInitialize(CrawlerSettings)}.
130: *
131: * @param controller Controller object.
132: */
133: public void initialize(CrawlController controller) {
134: // by default do nothing (subclasses override)
135: }
136:
137: public String toString() {
138: return "CrawlScope<" + getName() + ">";
139: }
140:
141: /**
142: * Refresh seeds.
143: *
144: */
145: public void refreshSeeds() {
146: // by default do nothing (subclasses which cache should override)
147: }
148:
149: /**
150: * @return Seed list file or null if problem getting settings file.
151: */
152: public File getSeedfile() {
153: File file = null;
154: try {
155: file = getSettingsHandler()
156: .getPathRelativeToWorkingDirectory(
157: (String) getAttribute(ATTR_SEEDS));
158: if (!file.exists() || !file.canRead()) {
159: throw new IOException("Seeds file "
160: + file.getAbsolutePath()
161: + " does not exist or unreadable.");
162: }
163: } catch (IOException e) {
164: DevUtils.warnHandle(e, "problem reading seeds");
165: } catch (AttributeNotFoundException e) {
166: DevUtils.warnHandle(e, "problem reading seeds");
167: } catch (MBeanException e) {
168: DevUtils.warnHandle(e, "problem reading seeds");
169: e.printStackTrace();
170: } catch (ReflectionException e) {
171: DevUtils.warnHandle(e, "problem reading seeds");
172: e.printStackTrace();
173: }
174:
175: return file;
176: }
177:
178: /** Check if a URI is in the seeds.
179: *
180: * @param o the URI to check.
181: * @return true if URI is a seed.
182: */
183: protected boolean isSeed(Object o) {
184: return o instanceof CandidateURI && ((CandidateURI) o).isSeed();
185: }
186:
187: /**
188: * @param a First UURI of compare.
189: * @param b Second UURI of compare.
190: * @return True if UURIs are of same host.
191: */
192: protected boolean isSameHost(UURI a, UURI b) {
193: boolean isSameHost = false;
194: if (a != null && b != null) {
195: // getHost can come back null. See
196: // "[ 910120 ] java.net.URI#getHost fails when leading digit"
197: try {
198: if (a.getReferencedHost() != null
199: && b.getReferencedHost() != null) {
200: if (a.getReferencedHost().equals(
201: b.getReferencedHost())) {
202: isSameHost = true;
203: }
204: }
205: } catch (URIException e) {
206: logger.severe("Failed compare of " + a + " " + b + ": "
207: + e.getMessage());
208: }
209: }
210: return isSameHost;
211: }
212:
213: /* (non-Javadoc)
214: * @see org.archive.crawler.settings.ModuleType#listUsedFiles(java.util.List)
215: */
216: public void listUsedFiles(List<String> list) {
217: // Add seed file
218: try {
219: File file = getSettingsHandler()
220: .getPathRelativeToWorkingDirectory(
221: (String) getAttribute(ATTR_SEEDS));
222: list.add(file.getAbsolutePath());
223: } catch (AttributeNotFoundException e) {
224: // TODO Auto-generated catch block
225: e.printStackTrace();
226: } catch (MBeanException e) {
227: // TODO Auto-generated catch block
228: e.printStackTrace();
229: } catch (ReflectionException e) {
230: // TODO Auto-generated catch block
231: e.printStackTrace();
232: }
233: }
234:
235: /**
236: * Take note of a situation (such as settings edit) where
237: * involved reconfiguration (such as reading from external
238: * files) may be necessary.
239: */
240: public void kickUpdate() {
241: // TODO: further improve this so that case with hundreds of
242: // thousands or millions of seeds works better without requiring
243: // this specific settings check
244: if (((Boolean) getUncheckedAttribute(null,
245: ATTR_REREAD_SEEDS_ON_CONFIG)).booleanValue()) {
246: refreshSeeds();
247: getSettingsHandler().getOrder().getController()
248: .getFrontier().loadSeeds();
249: }
250: }
251:
252: /**
253: * Gets an iterator over all configured seeds. Subclasses
254: * which cache seeds in memory can override with more
255: * efficient implementation.
256: *
257: * @return Iterator, perhaps over a disk file, of seeds
258: */
259: public Iterator<UURI> seedsIterator() {
260: return seedsIterator(null);
261: }
262:
263: /**
264: * Gets an iterator over all configured seeds. Subclasses
265: * which cache seeds in memory can override with more
266: * efficient implementation.
267: *
268: * @param ignoredItemWriter optional writer to get ignored seed items report
269: * @return Iterator, perhaps over a disk file, of seeds
270: */
271: public Iterator<UURI> seedsIterator(Writer ignoredItemWriter) {
272: BufferedReader br;
273: try {
274: br = new BufferedReader(new FileReader(getSeedfile()));
275: } catch (IOException e) {
276: throw new RuntimeException(e);
277: }
278: return new SeedFileIterator(br, ignoredItemWriter);
279: }
280:
281: /**
282: * Convenience method to close SeedFileIterator, if appropriate.
283: *
284: * @param iter Iterator to check if SeedFileIterator needing closing
285: */
286: protected void checkClose(Iterator iter) {
287: if (iter instanceof SeedFileIterator) {
288: ((SeedFileIterator) iter).close();
289: }
290: }
291:
292: /**
293: * Add a new seed to scope. By default, simply appends
294: * to seeds file, though subclasses may handle differently.
295: *
296: * <p>This method is *not* sufficient to get the new seed
297: * scheduled in the Frontier for crawling -- it only
298: * affects the Scope's seed record (and decisions which
299: * flow from seeds).
300: *
301: * @param curi CandidateUri to add
302: * @return true if successful, false if add failed for any reason
303: */
304: public boolean addSeed(final CandidateURI curi) {
305: File f = getSeedfile();
306: if (f != null) {
307: try {
308: FileWriter fw = new FileWriter(f, true);
309: // Write to new (last) line the URL.
310: fw.write("\n");
311: fw.write("# Heritrix added seed "
312: + ((curi.getVia() != null) ? "redirect from "
313: + curi.getVia() : "(JMX)") + ".\n");
314: fw.write(curi.toString());
315: fw.flush();
316: fw.close();
317: Iterator iter = seedListeners.iterator();
318: while (iter.hasNext()) {
319: ((SeedListener) iter.next()).addedSeed(curi);
320: }
321: return true;
322: } catch (IOException e) {
323: DevUtils.warnHandle(e, "problem writing new seed");
324: }
325: }
326: return false;
327: }
328:
329: public void addSeedListener(SeedListener sl) {
330: seedListeners.add(sl);
331: }
332: }
|