01: /*
02: * WebSphinx web-crawling toolkit
03: *
04: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
05: * reserved.
06: *
07: * Redistribution and use in source and binary forms, with or without
08: * modification, are permitted provided that the following conditions
09: * are met:
10: *
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: *
14: * 2. Redistributions in binary form must reproduce the above copyright
15: * notice, this list of conditions and the following disclaimer in
16: * the documentation and/or other materials provided with the
17: * distribution.
18: *
19: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
20: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
23: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30: *
31: */
32:
33: package websphinx;
34:
35: /**
36: * Classifier interface. A classifier is a helper object that annotates
37: * pages and links with labels (using Page.setLabel() and Link.setLabel()).
38: * When a page is retrieved by a crawler, it is passed to the classify()
39: * method of every Classifier registered with the crawler. Here are some
40: * typical uses for classifiers:
41: * <UL>
42: * <LI> classifying links into categories like child or parent (see
43: * websphinx.StandardClassifier);
44: * <LI> classifying pages into categories like biology or computers;
45: * <LI> recognizing and parsing pages formatted in a particular style, such as
46: * AltaVista, Yahoo, or latex2html (e.g., the search engine classifiers
47: * in websphinx.searchengine)
48: * <LI>
49: * </UL>
50: */
51: public interface Classifier
52: //#ifdef JDK1.1
53: extends java.io.Serializable
54: //#endif JDK1.1
55: {
56: /**
57: * Classify a page. Typically, the classifier calls page.setLabel() and
58: * page.setField() to mark up the page. The classifier may also look
59: * through the page's links and call link.setLabel() to mark them up.
60: * @param page Page to classify
61: */
62: public abstract void classify(Page page);
63:
64: /**
65: * Get priority of this classifier. Lower priorities execute first.
66: * A classifier should also define a public constant <CODE>priority</CODE>
67: * so that classifiers that depend on it can compute their
68: * priorities statically. For example, if your classifier
69: * depends on FooClassifier and BarClassifier, you might set your
70: * priority as:
71: * <PRE>
72: * public static final float priority = Math.max (FooClassifier, BarClassifier) + 1;
73: * public float getPriority () { return priority; }
74: * </PRE>
75: *
76: * @return priority of this classifier
77: */
78: public float getPriority();
79: }
|