/*
 * CrawlOrder
 *
 * $Header$
 *
 * Created on May 15, 2003
 *
 * Copyright (C) 2003 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 */

package org.archive.crawler.datamodel;

import java.io.File;
import java.io.Serializable;
import java.util.logging.Logger;

import javax.management.AttributeNotFoundException;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.framework.Processor;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.settings.MapType;
import org.archive.crawler.settings.ModuleType;
import org.archive.crawler.settings.SimpleType;
import org.archive.crawler.settings.Type;
import org.archive.crawler.url.canonicalize.BaseRule;

/**
 * Represents the 'root' of the settings hierarchy. Contains those settings that
 * do not belong to any specific module, but rather relate to the crawl as a
 * whole (much of this is used by the CrawlController directly or indirectly).
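 *
 * <p>Illustrative sketch (not part of the original documentation): the
 * settings declared here are read back through the attribute accessors
 * inherited from the settings framework; the variable name {@code order}
 * below is an assumed handle on the running CrawlOrder instance:
 * <pre>{@code
 * try {
 *     // null context means the global (non-overridden) settings
 *     int maxToes = ((Integer) order.getAttribute(null,
 *         CrawlOrder.ATTR_MAX_TOE_THREADS)).intValue();
 * } catch (AttributeNotFoundException e) {
 *     // not expected for attributes declared by CrawlOrder itself
 * }
 * // convenience accessors wrap the same lookups
 * int sameValue = order.getMaxToes();
 * }</pre>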
 *
 * @see org.archive.crawler.settings.ModuleType
 */
public class CrawlOrder extends ModuleType implements Serializable {

    private static final long serialVersionUID = -6715840285961511669L;

    private static Logger logger =
        Logger.getLogger("org.archive.crawler.datamodel.CrawlOrder");

    public static final String ATTR_NAME = "crawl-order";
    public static final String ATTR_SETTINGS_DIRECTORY = "settings-directory";
    public static final String ATTR_DISK_PATH = "disk-path";
    public static final String ATTR_LOGS_PATH = "logs-path";
    public static final String ATTR_CHECKPOINTS_PATH = "checkpoints-path";
    public static final String ATTR_STATE_PATH = "state-path";
    public static final String ATTR_SCRATCH_PATH = "scratch-path";
    public static final String ATTR_RECOVER_PATH = "recover-path";
    public static final String ATTR_RECOVER_RETAIN_FAILURES = "recover-retain-failures";
    public static final String ATTR_MAX_BYTES_DOWNLOAD = "max-bytes-download";
    public static final String ATTR_MAX_DOCUMENT_DOWNLOAD = "max-document-download";
    public static final String ATTR_MAX_TIME_SEC = "max-time-sec";
    public static final String ATTR_MAX_TOE_THREADS = "max-toe-threads";
    public static final String ATTR_HTTP_HEADERS = "http-headers";
    public static final String ATTR_USER_AGENT = "user-agent";
    public static final String ATTR_FROM = "from";
    public static final String ATTR_PRE_FETCH_PROCESSORS = "pre-fetch-processors";
    public static final String ATTR_FETCH_PROCESSORS = "fetch-processors";
    public static final String ATTR_EXTRACT_PROCESSORS = "extract-processors";
    public static final String ATTR_WRITE_PROCESSORS = "write-processors";
    public static final String ATTR_POST_PROCESSORS = "post-processors";
    public static final String ATTR_LOGGERS = "loggers";
    public static final String ATTR_RULES = "uri-canonicalization-rules";
    public static final String ATTR_RECORDER_OUT_BUFFER = "recorder-out-buffer-bytes";
    public static final String ATTR_RECORDER_IN_BUFFER = "recorder-in-buffer-bytes";

    /** Percentage of heap to allocate to bdb cache */
    public static final String ATTR_BDB_CACHE_PERCENT = "bdb-cache-percent";

    /**
     * When checkpointing, copy the bdb logs.
     * Default is true. If false, then we do not copy logs on checkpoint AND
     * we tell bdbje never to delete log files; instead it renames
     * files-to-delete with a '.del' extension. The assumption is that when
     * this setting is false, an external process is managing the removal of
     * bdbje log files and that, when it comes time to recover from a
     * checkpoint, the files that comprise a checkpoint are manually
     * assembled.
     */
    public static final String ATTR_CHECKPOINT_COPY_BDBJE_LOGS =
        "checkpoint-copy-bdbje-logs";
    public static final Boolean DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS = Boolean.TRUE;

    /**
     * Default size of bdb cache.
     */
    private final static Integer DEFAULT_BDB_CACHE_PERCENT = new Integer(0);

    private transient MapType httpHeaders;
    private transient MapType loggers;

    private transient CrawlController controller;

    /**
     * Regex for acceptable user-agent format.
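     *
     * <p>For illustration only (example value, not from the original source):
     * a user-agent such as
     * {@code "Mozilla/5.0 (compatible; examplebot/1.0 +http://example.org/crawl)"}
     * satisfies this pattern, because it carries a parenthesized
     * '+http(s)://...' project URL with a dotted host.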
     */
    private static String ACCEPTABLE_USER_AGENT =
        "\\S+.*\\(.*\\+http(s)?://\\S+\\.\\S+.*\\).*";

    /**
     * Regex for acceptable from address.
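     *
     * <p>For illustration only (example value, not from the original source):
     * {@code "webmaster@example.org"} satisfies this pattern, which requires
     * something of the shape {@code name@host.domain}.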
     */
    private static String ACCEPTABLE_FROM = "\\S+@\\S+\\.\\S+";

    /** Construct a CrawlOrder.
     */
    public CrawlOrder() {
        super(ATTR_NAME,
            "Heritrix crawl order. This forms the root of "
                + "the settings framework.");
        Type e;

        e = addElementToDefinition(new SimpleType(
            ATTR_SETTINGS_DIRECTORY,
            "Directory where override settings are kept. The settings "
                + "for many modules can be overridden based on the domain or "
                + "subdomain of the URI being processed. This setting specifies"
                + " a file-level directory to store those settings. The path"
                + " is relative to 'disk-path' unless"
                + " an absolute path is provided.", "settings"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_DISK_PATH,
            "Directory where logs, arcs and other run-time files will "
                + "be kept. If this path is a relative path, it will be "
                + "relative to the crawl order.", ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_LOGS_PATH,
            "Directory where crawler log files will be kept. If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "logs"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINTS_PATH,
            "Directory where crawler checkpoint files will be kept. "
                + "If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "checkpoints"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_STATE_PATH,
            "Directory where crawler-state files will be kept. If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "state"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_SCRATCH_PATH,
            "Directory where discardable temporary files will be kept. "
                + "If this path "
                + "is a relative path, it will be relative to the 'disk-path'.",
            "scratch"));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_BYTES_DOWNLOAD,
            "Maximum number of bytes to download. Once this number is"
                + " exceeded the crawler will stop. "
                + "A value of zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_DOCUMENT_DOWNLOAD,
            "Maximum number of documents to download. Once this number"
                + " is exceeded the crawler will stop. "
                + "A value of zero means no upper limit.",
            new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_TIME_SEC,
            "Maximum amount of time to crawl (in seconds). Once this"
                + " much time has elapsed the crawler will stop. A value of"
                + " zero means no upper limit.", new Long(0)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_MAX_TOE_THREADS,
            "Maximum number of threads processing URIs at the same time.",
            new Integer(100)));
        e.setOverrideable(false);

        e = addElementToDefinition(new SimpleType(
            ATTR_RECORDER_OUT_BUFFER,
            "Size in bytes of in-memory buffer to record outbound "
                + "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(4096)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_RECORDER_IN_BUFFER,
            "Size in bytes of in-memory buffer to record inbound "
                + "traffic. One such buffer is reserved for every ToeThread.",
            new Integer(65536)));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_BDB_CACHE_PERCENT,
            "Percentage of heap to allocate to BerkeleyDB JE cache. "
                + "Default of zero means no preference (accept BDB's default, "
                + "usually 60%, or the je.maxMemoryPercent property value).",
            DEFAULT_BDB_CACHE_PERCENT));
        e.setExpertSetting(true);
        e.setOverrideable(false);

        addElementToDefinition(new CrawlScope());

        httpHeaders = (MapType) addElementToDefinition(new MapType(
            ATTR_HTTP_HEADERS,
            "HTTP headers. Information that will "
                + "be used when constructing the HTTP headers of "
                + "the crawler's HTTP requests."));

        e = httpHeaders.addElementToDefinition(new SimpleType(
            ATTR_USER_AGENT,
            "User agent to act as. Field must contain a valid URL "
                + "that links to the website of the person or organization "
                + "running the crawl. Replace 'PROJECT_URL_HERE' in the "
                + "initial template. For example, if the organization "
                + "is the Library of Congress, a valid user agent would be: "
                + "'Mozilla/5.0 (compatible; loc-crawler/0.11.0 "
                + "+http://loc.gov)'. "
                + "Note, you must preserve the '+' before the 'http'.",
            "Mozilla/5.0 (compatible; heritrix/@VERSION@ +PROJECT_URL_HERE)"));

        e = httpHeaders.addElementToDefinition(new SimpleType(
            ATTR_FROM,
            "Contact information. This field must contain a valid "
                + "e-mail address for the person or organization responsible "
                + "for this crawl: e.g. 'webmaster@loc.gov'.",
            "CONTACT_EMAIL_ADDRESS_HERE"));

        addElementToDefinition(new RobotsHonoringPolicy());

        e = addElementToDefinition(new ModuleType(Frontier.ATTR_NAME,
            "Frontier"));
        e.setLegalValueType(Frontier.class);

        e = (MapType) addElementToDefinition(new MapType(
            ATTR_RULES,
            "Ordered list of url canonicalization rules. "
                + "Rules are applied in the order listed from top to bottom.",
            BaseRule.class));
        e.setOverrideable(true);
        e.setExpertSetting(true);

        e = addElementToDefinition(new MapType(
            ATTR_PRE_FETCH_PROCESSORS,
            "Processors to run prior to fetching anything from the network.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_FETCH_PROCESSORS,
            "Processors that fetch documents.", Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_EXTRACT_PROCESSORS,
            "Processors that extract new URIs from fetched documents.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_WRITE_PROCESSORS,
            "Processors that write documents to archives.",
            Processor.class));
        e.setOverrideable(false);

        e = addElementToDefinition(new MapType(ATTR_POST_PROCESSORS,
            "Processors that do cleanup and feed the frontier with new URIs.",
            Processor.class));
        e.setOverrideable(false);

        loggers = (MapType) addElementToDefinition(new MapType(
            ATTR_LOGGERS,
            "Statistics tracking modules. Any number of specialized "
                + "statistics trackers that monitor a crawl and write logs, "
                + "reports and/or provide information to the user interface."));

        e = addElementToDefinition(new SimpleType(
            ATTR_RECOVER_PATH,
            "Optional. Points at a recover log (or recover.gz log) OR "
                + "the checkpoint directory to use when recovering a crawl.",
            ""));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_CHECKPOINT_COPY_BDBJE_LOGS,
            "When true, on a checkpoint, we copy off the bdbje log files to "
                + "the checkpoint directory. To recover a checkpoint, just "
                + "set the "
                + ATTR_RECOVER_PATH
                + " to point at the checkpoint "
                + "directory to recover. This is the default setting. "
                + "But if the crawl is large, "
                + "copying bdbje log files can take tens of minutes and even "
                + "upwards of an hour (copying bdbje log files will consume the "
                + "bulk of checkpointing time). If this setting is false, we do "
                + "NOT copy bdbje logs on checkpoint AND we set bdbje to NEVER "
                + "delete log files (instead we have it rename files-to-delete "
                + "with a '.del' extension). The assumption is that when this "
                + "setting is false, an external process is managing the removal "
                + "of bdbje log files and that, come time to recover from a "
                + "checkpoint, the files that comprise a checkpoint are manually "
                + "assembled. This is an expert setting.",
            DEFAULT_CHECKPOINT_COPY_BDBJE_LOGS));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new SimpleType(
            ATTR_RECOVER_RETAIN_FAILURES,
            "When recovering via the recover.log, should failures "
                + "in the log be retained in the recovered crawl, "
                + "preventing the corresponding URIs from being retried. "
                + "Default is false, meaning failures are forgotten, and "
                + "the corresponding URIs will be retried in the recovered "
                + "crawl.", Boolean.FALSE));
        e.setOverrideable(false);
        e.setExpertSetting(true);

        e = addElementToDefinition(new CredentialStore(
            CredentialStore.ATTR_NAME));
        e.setOverrideable(true);
        e.setExpertSetting(true);
    }

    /**
     * @param curi settings context (the current URI); may be null to use
     * global settings
     * @return user-agent header value to use
     */
    public String getUserAgent(CrawlURI curi) {
        return ((String) httpHeaders.getUncheckedAttribute(curi,
            ATTR_USER_AGENT));
    }

    /**
     * @param curi settings context (the current URI); may be null to use
     * global settings
     * @return from header value to use
     */
    public String getFrom(CrawlURI curi) {
        String res = null;
        try {
            res = (String) httpHeaders.getAttribute(ATTR_FROM, curi);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res;
    }

    /**
     * Returns the set number of maximum toe threads.
     * @return Number of maximum toe threads
     */
    public int getMaxToes() {
        Integer res = null;
        try {
            res = (Integer) getAttribute(null, ATTR_MAX_TOE_THREADS);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
        }
        return res.intValue();
    }

    /**
     * Gets the RobotsHonoringPolicy object from the order file.
     *
     * @return the RobotsHonoringPolicy
     */
    public RobotsHonoringPolicy getRobotsHonoringPolicy() {
        try {
            return (RobotsHonoringPolicy) getAttribute(null,
                RobotsHonoringPolicy.ATTR_NAME);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /** Get the name of the order file.
     *
     * @return the name of the order file.
     */
    public String getCrawlOrderName() {
        return getSettingsHandler().getSettingsObject(null).getName();
    }

    /**
     * @return The crawl controller.
     */
    public CrawlController getController() {
        return controller;
    }

    /**
     * @param controller the crawl controller to associate with this order
     */
    public void setController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * Returns the Map of the StatisticsTracking modules that are included
     * in the configuration that this instance represents.
     * @return Map of the StatisticsTracking modules
     */
    public MapType getLoggers() {
        return loggers;
    }

    /**
     * Checks if the User Agent and From field are set 'correctly' in
     * the specified Crawl Order.
     *
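     * <p>For illustration (example values, not from the original source):
     * a user-agent of
     * {@code "Mozilla/5.0 (compatible; examplebot/1.0 +http://example.org/crawl)"}
     * together with a from address of {@code "webmaster@example.org"} would
     * pass this check; the shipped placeholders 'PROJECT_URL_HERE' and
     * 'CONTACT_EMAIL_ADDRESS_HERE' would not.
     *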
     * @throws FatalConfigurationException
     */
    public void checkUserAgentAndFrom() throws FatalConfigurationException {
        // don't start the crawl if they're using the default user-agent
        String userAgent = this.getUserAgent(null);
        String from = this.getFrom(null);
        if (!(userAgent.matches(ACCEPTABLE_USER_AGENT)
                && from.matches(ACCEPTABLE_FROM))) {
            throw new FatalConfigurationException(
                "unacceptable user-agent or from (re-edit your order file).");
        }
    }

    /**
     * @return Checkpoint directory.
     */
    public File getCheckpointsDirectory() {
        try {
            return getDirectoryRelativeToDiskPath((String) getAttribute(
                null, CrawlOrder.ATTR_CHECKPOINTS_PATH));
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    private File getDirectoryRelativeToDiskPath(String subpath) {
        File disk;
        try {
            disk = getSettingsHandler().getPathRelativeToWorkingDirectory(
                (String) getAttribute(null, CrawlOrder.ATTR_DISK_PATH));
            return new File(disk, subpath);
        } catch (AttributeNotFoundException e) {
            logger.severe(e.getMessage());
            return null;
        }
    }

    /**
     * Return the full path to the directory named by <code>key</code>
     * in settings.
     * If the directory does not exist, it and all intermediary directories
     * will be created.
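     *
     * <p>Illustrative sketch (not part of the original documentation),
     * assuming {@code order} names the running CrawlOrder instance:
     * <pre>{@code
     * // resolve (and create, if necessary) the configured logs directory;
     * // callers handle or rethrow the declared AttributeNotFoundException
     * File logsDir = order.getSettingsDir(CrawlOrder.ATTR_LOGS_PATH);
     * }</pre>
     *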
     * @param key key of the path setting to look up.
     * @return Full path to directory named by <code>key</code>.
     * @throws AttributeNotFoundException
     */
    public File getSettingsDir(String key) throws AttributeNotFoundException {
        String path = (String) getAttribute(null, key);
        File f = new File(path);
        if (!f.isAbsolute()) {
            f = getDirectoryRelativeToDiskPath(path);
        }
        if (!f.exists()) {
            f.mkdirs();
        }
        return f;
    }

}