Source Code Cross Referenced for LinksScoper.java in » Web-Crawler » heritrix » org » archive » crawler » postprocessor » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler.postprocessor

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /* LinksScoper
002:         * 
003:         * $Id: LinksScoper.java 4911 2007-02-18 19:55:55Z gojomo $
004:         *
005:         * Created on Oct 2, 2003
006:         * 
007:         * Copyright (C) 2003 Internet Archive.
008:         *
009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
010:         *
011:         * Heritrix is free software; you can redistribute it and/or modify
012:         * it under the terms of the GNU Lesser Public License as published by
013:         * the Free Software Foundation; either version 2.1 of the License, or
014:         * any later version.
015:         *
016:         * Heritrix is distributed in the hope that it will be useful,
017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019:         * GNU Lesser Public License for more details.
020:         *
021:         * You should have received a copy of the GNU Lesser Public License
022:         * along with Heritrix; if not, write to the Free Software
023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024:         *
025:         */
026:        package org.archive.crawler.postprocessor;
027:
028:        import java.util.Collection;
029:        import java.util.HashSet;
030:        import java.util.Iterator;
031:        import java.util.logging.Level;
032:        import java.util.logging.Logger;
033:
034:        import javax.management.AttributeNotFoundException;
035:
036:        import org.apache.commons.httpclient.URIException;
037:        import org.archive.crawler.datamodel.CandidateURI;
038:        import org.archive.crawler.datamodel.CrawlURI;
039:        import org.archive.crawler.datamodel.FetchStatusCodes;
040:        import org.archive.crawler.deciderules.DecideRule;
041:        import org.archive.crawler.deciderules.DecideRuleSequence;
042:        import org.archive.crawler.extractor.Link;
043:        import org.archive.crawler.framework.Filter;
044:        import org.archive.crawler.framework.Scoper;
045:        import org.archive.crawler.settings.MapType;
046:        import org.archive.crawler.settings.SimpleType;
047:        import org.archive.crawler.settings.Type;
048:
049:        /**
050:         * Determine which extracted links are within scope.
051:         * TODO: To test scope, requires that Link be converted to
052:         * a CandidateURI.  Make it so don't have to make a CandidateURI to test
053:         * if Link is in scope.
054:         * <p>Since this scoper has to create CandidateURIs, no sense
055:         * discarding them since later in the processing chain CandidateURIs rather
056:         * than Links are whats needed scheduling extracted links w/ the
057:         * Frontier (Frontier#schedule expects CandidateURI, not Link).  This class
058:         * replaces Links w/ the CandidateURI that wraps the Link in the CrawlURI.
059:         *
060:         * @author gojomo
061:         * @author stack
062:         */
063:        public class LinksScoper extends Scoper implements  FetchStatusCodes {
064:
065:            private static final long serialVersionUID = -4074442117992496793L;
066:
067:            private static Logger LOGGER = Logger.getLogger(LinksScoper.class
068:                    .getName());
069:
070:            private final static String ATTR_SEED_REDIRECTS_NEW_SEEDS = "seed-redirects-new-seed";
071:
072:            private final static Boolean DEFAULT_SEED_REDIRECTS_NEW_SEEDS = new Boolean(
073:                    true);
074:
075:            public static final String ATTR_REJECTLOG_DECIDE_RULES = "scope-rejected-url-rules";
076:
077:            public static final String ATTR_PREFERENCE_DEPTH_HOPS = "preference-depth-hops";
078:
079:            private final static Integer DEFAULT_PREFERENCE_DEPTH_HOPS = new Integer(
080:                    -1);
081:
082:            /**
083:             * Instance of rejected uris log filters.
084:             */
085:            private MapType rejectLogFilters = null;
086:
087:            /**
088:             * @param name Name of this filter.
089:             */
090:            public LinksScoper(String name) {
091:                super (name, "LinksScoper. Rules on which extracted links "
092:                        + "are within configured scope.");
093:
094:                Type t;
095:                t = addElementToDefinition(new SimpleType(
096:                        ATTR_SEED_REDIRECTS_NEW_SEEDS,
097:                        "If enabled, any URL found because a seed redirected to it "
098:                                + "(original seed returned 301 or 302), will also be treated "
099:                                + "as a seed.",
100:                        DEFAULT_SEED_REDIRECTS_NEW_SEEDS));
101:                t.setExpertSetting(true);
102:
103:                t = addElementToDefinition(new SimpleType(
104:                        ATTR_PREFERENCE_DEPTH_HOPS,
105:                        "Number of hops (of any sort) from a seed up to which a URI has higher "
106:                                + "priority scheduling than any remaining seed. For example, if set to 1 items one "
107:                                + "hop (link, embed, redirect, etc.) away from a seed will be scheduled "
108:                                + "with HIGH priority. If set to -1, no "
109:                                + "preferencing will occur, and a breadth-first search with seeds "
110:                                + "processed before discovered links will proceed. If set to zero, a "
111:                                + "purely depth-first search will proceed, with all discovered links processed "
112:                                + "before remaining seeds.  Seed redirects are treated as one hop from a seed.",
113:                        DEFAULT_PREFERENCE_DEPTH_HOPS));
114:                t.setExpertSetting(true);
115:
116:                addElementToDefinition(new DecideRuleSequence(
117:                        ATTR_REJECTLOG_DECIDE_RULES,
118:                        "DecideRules which, if their final decision on a link is "
119:                                + "not REJECT, cause the otherwise scope-rejected links to "
120:                                + "be logged"));
121:
122:            }
123:
124:            protected void innerProcess(final CrawlURI curi) {
125:                if (LOGGER.isLoggable(Level.FINEST)) {
126:                    LOGGER.finest(getName() + " processing " + curi);
127:                }
128:
129:                // If prerequisites, nothing to be done in here.
130:                if (curi.hasPrerequisiteUri()) {
131:                    handlePrerequisite(curi);
132:                    return;
133:                }
134:
135:                // Don't extract links of error pages.
136:                if (curi.getFetchStatus() < 200 || curi.getFetchStatus() >= 400) {
137:                    curi.clearOutlinks();
138:                    return;
139:                }
140:
141:                if (curi.outlinksSize() <= 0) {
142:                    // No outlinks to process.
143:                    return;
144:                }
145:
146:                final boolean redirectsNewSeeds = ((Boolean) getUncheckedAttribute(
147:                        curi, ATTR_SEED_REDIRECTS_NEW_SEEDS)).booleanValue();
148:                int preferenceDepthHops = ((Integer) getUncheckedAttribute(
149:                        curi, ATTR_PREFERENCE_DEPTH_HOPS)).intValue();
150:                Collection<CandidateURI> inScopeLinks = new HashSet<CandidateURI>();
151:                for (final Iterator i = curi.getOutObjects().iterator(); i
152:                        .hasNext();) {
153:                    Object o = i.next();
154:                    if (o instanceof  Link) {
155:                        final Link wref = (Link) o;
156:                        try {
157:                            final int directive = getSchedulingFor(curi, wref,
158:                                    preferenceDepthHops);
159:                            final CandidateURI caURI = curi.createCandidateURI(
160:                                    curi.getBaseURI(), wref, directive,
161:                                    considerAsSeed(curi, wref,
162:                                            redirectsNewSeeds));
163:                            if (isInScope(caURI)) {
164:                                inScopeLinks.add(caURI);
165:                            }
166:                        } catch (URIException e) {
167:                            getController().logUriError(e, curi.getUURI(),
168:                                    wref.getDestination().toString());
169:                        }
170:                    } else if (o instanceof  CandidateURI) {
171:                        CandidateURI caURI = (CandidateURI) o;
172:                        if (isInScope(caURI)) {
173:                            inScopeLinks.add(caURI);
174:                        }
175:                    } else {
176:                        LOGGER.severe("Unexpected type: " + o);
177:                    }
178:                }
179:                // Replace current links collection w/ inscopeLinks.  May be
180:                // an empty collection.
181:                curi.replaceOutlinks(inScopeLinks);
182:            }
183:
184:            /**
185:             * The CrawlURI has a prerequisite; apply scoping and update
186:             * Link to CandidateURI in manner analogous to outlink handling. 
187:             * @param curi CrawlURI with prereq to consider
188:             */
189:            protected void handlePrerequisite(CrawlURI curi) {
190:                try {
191:                    // Create prerequisite CandidateURI
192:                    CandidateURI caUri = curi.createCandidateURI(curi
193:                            .getBaseURI(), (Link) curi.getPrerequisiteUri());
194:                    int prereqPriority = curi.getSchedulingDirective() - 1;
195:                    if (prereqPriority < 0) {
196:                        prereqPriority = 0;
197:                        LOGGER.severe("Unable to promote prerequisite " + caUri
198:                                + " above " + curi);
199:                    }
200:                    caUri.setSchedulingDirective(prereqPriority);
201:                    caUri.setForceFetch(true);
202:                    if (isInScope(caUri)) {
203:                        // replace link with CandidateURI
204:                        curi.setPrerequisiteUri(caUri);
205:                    } else {
206:                        // prerequisite is out-of-scope; mark CrawlURI as error,
207:                        // preventinting normal S_DEFERRED handling
208:                        curi
209:                                .setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
210:                    }
211:                } catch (URIException ex) {
212:                    Object[] array = { curi, curi.getPrerequisiteUri() };
213:                    getController().uriErrors.log(Level.INFO, ex.getMessage(),
214:                            array);
215:                } catch (NumberFormatException e) {
216:                    // UURI.createUURI will occasionally throw this error.
217:                    Object[] array = { curi, curi.getPrerequisiteUri() };
218:                    getController().uriErrors.log(Level.INFO, e.getMessage(),
219:                            array);
220:                }
221:            }
222:
223:            protected void outOfScope(CandidateURI caUri) {
224:                super .outOfScope(caUri);
225:                if (!LOGGER.isLoggable(Level.INFO)) {
226:                    return;
227:                }
228:                // TODO: Fix filters so work on CandidateURI.
229:                CrawlURI curi = (caUri instanceof  CrawlURI) ? (CrawlURI) caUri
230:                        : new CrawlURI(caUri.getUURI());
231:                if (rulesAccept(getRejectLogRules(curi), curi)) {
232:                    LOGGER.info(curi.getUURI().toString());
233:                }
234:            }
235:
236:            protected DecideRule getRejectLogRules(Object o) {
237:                try {
238:                    return (DecideRule) getAttribute(o,
239:                            ATTR_REJECTLOG_DECIDE_RULES);
240:                } catch (AttributeNotFoundException e) {
241:                    throw new RuntimeException(e);
242:                }
243:            }
244:
245:            private boolean considerAsSeed(final CrawlURI curi,
246:                    final Link wref, final boolean redirectsNewSeeds) {
247:                // Check if this is a seed with a 301 or 302.
248:                if (curi.isSeed()
249:                        && (curi.getFetchStatus() == 301 || curi
250:                                .getFetchStatus() == 302)
251:                        && wref.getHopType() == Link.REFER_HOP) {
252:                    // Check if redirects from seeds should be treated as seeds.
253:                    if (redirectsNewSeeds) {
254:                        return true;
255:                    }
256:                }
257:                return false;
258:            }
259:
260:            /**
261:             * Determine scheduling for the  <code>curi</code>.
262:             * As with the LinksScoper in general, this only handles extracted links,
263:             * seeds do not pass through here, but are given MEDIUM priority.  
264:             * Imports into the frontier similarly do not pass through here, 
265:             * but are given NORMAL priority.
266:             */
267:            protected int getSchedulingFor(final CrawlURI curi,
268:                    final Link wref, final int preferenceDepthHops) {
269:                final char c = wref.getHopType();
270:                if (LOGGER.isLoggable(Level.FINEST)) {
271:                    LOGGER.finest(curi + " with path=" + curi.getPathFromSeed()
272:                            + " isSeed=" + curi.isSeed() + " with fetchStatus="
273:                            + curi.getFetchStatus() + " -> "
274:                            + wref.getDestination() + " type " + c
275:                            + " with context=" + wref.getContext());
276:                }
277:
278:                switch (c) {
279:                case Link.REFER_HOP:
280:                    // Treat redirects somewhat urgently
281:                    // This also ensures seed redirects remain seed priority
282:                    return (preferenceDepthHops >= 0 ? CandidateURI.HIGH
283:                            : CandidateURI.MEDIUM);
284:                default:
285:                    if (preferenceDepthHops == 0)
286:                        return CandidateURI.HIGH;
287:                    // this implies seed redirects are treated as path
288:                    // length 1, which I belive is standard.
289:                    // curi.getPathFromSeed() can never be null here, because
290:                    // we're processing a link extracted from curi
291:                    if (preferenceDepthHops > 0
292:                            && curi.getPathFromSeed().length() + 1 <= preferenceDepthHops)
293:                        return CandidateURI.HIGH;
294:                    // Everything else normal (at least for now)
295:                    return CandidateURI.NORMAL;
296:                }
297:            }
298:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.