01: /*
02: * Copyright 2005 by Lars Torunski
03: *
04: * Licensed under the Apache License, Version 2.0 (the "License");
05: * you may not use this file except in compliance with the License.
06: * You may obtain a copy of the License at
07: *
08: * http://www.apache.org/licenses/LICENSE-2.0
09: *
10: * Unless required by applicable law or agreed to in writing, software
11: * distributed under the License is distributed on an "AS IS" BASIS,
12: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13: * See the License for the specific language governing permissions and
14: * limitations under the License.
15: *
16: */
17: package com.torunski.crawler.parser;
18:
19: import java.util.Collection;
20:
21: import com.torunski.crawler.filter.ILinkFilter;
22: import com.torunski.crawler.link.Link;
23:
24: /**
25: * Defines an interface for the parsers. With the load method it is possible to download
26: * different pages and to parse them later in a different thread.
27: *
28: * @author Lars Torunski
29: * @version $Revision: 1.6 $
30: */
31: public interface IParser {
32:
33: /**
34: * Loads the data of the URI. A crawler can load different URIs at the same
35: * time and parse them later. Hence all necessary information has to be stored
36: * in a PageData object. E.g. different threads can download the content of the
37: * URIs in parallel and parse them in a different order.
38: *
39: * @param link the link of the page
40: * @return the page data of the URI or <code>null</code> if preloading the data failed
41: */
42: PageData load(Link link);
43:
44: /**
45: * Parses a PageData object, e.g. for links, and returns them in a Collection.
46: *
47: * @param pageData the page data of the page
48: * @param linkFilter the filter for the URIs
49: * @return a collection of new URIs in the pageData filtered by the linkFilter
50: */
51: Collection parse(PageData pageData, ILinkFilter linkFilter);
52:
53: }
|