Source Code Cross Referenced for Processor.java in » Web-Crawler » heritrix » org » archive » crawler » framework » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.framework

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /* Copyright (C) 2003 Internet Archive.
002:         *
003:         * This file is part of the Heritrix web crawler (crawler.archive.org).
004:         *
005:         * Heritrix is free software; you can redistribute it and/or modify
006:         * it under the terms of the GNU Lesser Public License as published by
007:         * the Free Software Foundation; either version 2.1 of the License, or
008:         * any later version.
009:         *
010:         * Heritrix is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013:         * GNU Lesser Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser Public License
016:         * along with Heritrix; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         *
019:         * Processor.java
020:         * Created on Apr 16, 2003
021:         *
022:         * $Header$
023:         */
024:        package org.archive.crawler.framework;
025:
026:        import java.lang.reflect.Constructor;
027:        import java.util.Iterator;
028:        import java.util.logging.Level;
029:        import java.util.logging.Logger;
030:
031:        import javax.management.AttributeNotFoundException;
032:
033:        import org.archive.crawler.datamodel.CrawlURI;
034:        import org.archive.crawler.deciderules.DecideRule;
035:        import org.archive.crawler.deciderules.DecideRuleSequence;
036:        import org.archive.crawler.settings.MapType;
037:        import org.archive.crawler.settings.ModuleType;
038:        import org.archive.crawler.settings.SimpleType;
039:
040:        /**
041:         * Base class for URI processing classes.
042:         *
043:         * <p> Each URI is processed by a user defined series of processors. This class
044:         * provides the basic infrastructure for these but does not actually do
045:         * anything. New processors can be easily created by subclassing this class.
046:         *
047:         * <p> Classes subclassing this one should not trap InterruptedExceptions.
048:         * They should be allowed to propagate to the ToeThread executing the processor.
049:         * Also they should immediately exit their main method (<tt>innerProcess()</tt>)
050:         * if the <tt>interrupted</tt> flag is set.
051:         *
052:         * @author Gordon Mohr
053:         *
054:         * @see org.archive.crawler.framework.ToeThread
055:         */
056:        public class Processor extends ModuleType {
057:
058:            private static final long serialVersionUID = 6248563827413710226L;
059:
060:            /**
061:             * Key to use asking settings for decide-rules value.
062:             */
063:            public static final String ATTR_DECIDE_RULES = "decide-rules";
064:            /** local name for decide-rules */
065:            protected String attrDecideRules;
066:
067:            /**
068:             * Key to use asking settings for enabled value.
069:             */
070:            public final static String ATTR_ENABLED = "enabled";
071:
072:            private Processor defaultNextProcessor = null;
073:
074:            private static Logger logger = Logger
075:                    .getLogger("org.archive.crawler.framework.Processor");
076:
077:            /**
078:             * @param name
079:             * @param description
080:             */
081:            public Processor(String name, String description) {
082:                super (name, description);
083:                addElementToDefinition(new SimpleType(ATTR_ENABLED,
084:                        "Is processor enabled", new Boolean(true)));
085:                attrDecideRules = getName() + "#" + ATTR_DECIDE_RULES;
086:                addElementToDefinition(new DecideRuleSequence(attrDecideRules,
087:                        "DecideRules which, if their final decision is REJECT, "
088:                                + "prevent this Processor from running."));
089:            }
090:
091:            /**
092:             * Perform processing on the given CrawlURI.
093:             *
094:             * @param curi
095:             * @throws InterruptedException
096:             */
097:            public final void process(CrawlURI curi)
098:                    throws InterruptedException {
099:                // by default, arrange for curi to proceed to next processor
100:                curi.setNextProcessor(getDefaultNextProcessor(curi));
101:
102:                // Check if this processor is enabled before processing
103:                try {
104:                    if (!((Boolean) getAttribute(ATTR_ENABLED, curi))
105:                            .booleanValue()) {
106:                        return;
107:                    }
108:                } catch (AttributeNotFoundException e) {
109:                    logger.severe(e.getMessage());
110:                }
111:
112:                if (rulesAccept(curi)) {
113:                    innerProcess(curi);
114:                } else {
115:                    innerRejectProcess(curi);
116:                }
117:            }
118:
119:            protected void checkForInterrupt() throws InterruptedException {
120:                if (Thread.interrupted()) {
121:                    throw new InterruptedException("interrupted");
122:                }
123:            }
124:
125:            /**
126:             * @param curi CrawlURI instance.
127:             * @throws InterruptedException
128:             */
129:            protected void innerRejectProcess(CrawlURI curi)
130:                    throws InterruptedException {
131:                // by default do nothing
132:            }
133:
134:            /**
135:             * Classes subclassing this one should override this method to perform
136:             * their custom actions on the CrawlURI.
137:             *
138:             * @param curi The CrawlURI being processed.
139:             * @throws InterruptedException
140:             */
141:            protected void innerProcess(CrawlURI curi)
142:                    throws InterruptedException {
143:                // by default do nothing
144:            }
145:
146:            /**
147:             * Classes subclassing this one should override this method to perform
148:             * processor specific actions.
149:             * <p>
150:             *
151:             * This method is garanteed to be called after the crawl is set up, but
152:             * before any URI-processing has occured.
153:             */
154:            protected void initialTasks() {
155:                // by default do nothing
156:            }
157:
158:            /**
159:             * Classes subclassing this one should override this method to perform
160:             * processor specific actions.
161:             *
162:             */
163:            protected void finalTasks() {
164:                // by default do nothing
165:            }
166:
167:            protected DecideRule getDecideRule(Object o) {
168:                try {
169:                    return (DecideRule) getAttribute(o, attrDecideRules);
170:                } catch (AttributeNotFoundException e) {
171:                    throw new RuntimeException(e);
172:                }
173:            }
174:
175:            protected boolean rulesAccept(Object o) {
176:                return rulesAccept(getDecideRule(o), o);
177:            }
178:
179:            protected boolean rulesAccept(DecideRule rule, Object o) {
180:                return rule.decisionFor(o) != DecideRule.REJECT;
181:            }
182:
183:            /**
184:             * Returns the next processor for the given CrawlURI in the processor chain.
185:             * @param curi The CrawlURI that we want to find the next processor for.
186:             * @return The next processor for the given CrawlURI in the processor chain.
187:             */
188:            public Processor getDefaultNextProcessor(CrawlURI curi) {
189:                return defaultNextProcessor;
190:            }
191:
192:            /** Set the default next processor in the chain.
193:             *
194:             * @param nextProcessor the default next processor in the chain.
195:             */
196:            public void setDefaultNextProcessor(Processor nextProcessor) {
197:                defaultNextProcessor = nextProcessor;
198:            }
199:
200:            /** 
201:             * Get the controller object.
202:             *
203:             * @return the controller object.
204:             */
205:            public CrawlController getController() {
206:                return getSettingsHandler().getOrder().getController();
207:            }
208:
209:            public Processor spawn(int serialNum) {
210:                Processor newInst = null;
211:                try {
212:                    Constructor co = getClass().getConstructor(
213:                            new Class[] { String.class });
214:                    newInst = (Processor) co
215:                            .newInstance(new Object[] { getName() + serialNum });
216:                    getParent().setAttribute(newInst);
217:                    newInst.setTransient(true);
218:                } catch (Exception e) {
219:                    // TODO Auto-generated catch block
220:                    e.printStackTrace();
221:                }
222:                return newInst;
223:            }
224:
225:            /**
226:             * Compiles and returns a report (in human readable form) about the status
227:             * of the processor.  The processor's name (of implementing class) should
228:             * always be included.
229:             * <p>
230:             * Examples of stats declared would include:<br>
231:             * * Number of CrawlURIs handled.<br>
232:             * * Number of links extracted (for link extractors)<br>
233:             * etc.
234:             *
235:             * @return A human readable report on the processor's state.
236:             */
237:            public String report() {
238:                return ""; // Default behavior.
239:            }
240:
241:            /**
242:             * @param curi CrawlURI to examine.
243:             * @return True if content to process -- content length is > 0 
244:             * -- and links have not yet been extracted.
245:             */
246:            protected boolean isContentToProcess(CrawlURI curi) {
247:                return !curi.hasBeenLinkExtracted()
248:                        && curi.getContentLength() > 0;
249:            }
250:
251:            /**
252:             * @param curi CrawlURI to examine.
253:             * @return True if {@link #isContentToProcess(CrawlURI)} and
254:             * the CrawlURI represents a successful http transaction.
255:             */
256:            protected boolean isHttpTransactionContentToProcess(CrawlURI curi) {
257:                return isContentToProcess(curi) && curi.isHttpTransaction()
258:                        && curi.isSuccess();
259:            }
260:
261:            /**
262:             * @param contentType Found content type.
263:             * @param expectedPrefix String to find at start of contenttype: e.g.
264:             * <code>text/html</code>.
265:             * @return True if passed content-type begins with
266:             * expected mimetype.
267:             */
268:            protected boolean isExpectedMimeType(String contentType,
269:                    String expectedPrefix) {
270:                return contentType != null
271:                        && contentType.toLowerCase().startsWith(expectedPrefix);
272:            }
273:
274:            public void kickUpdate() {
275:                // by default do nothing
276:            }
277:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.