001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx;
034:
035: import java.net.URL;
036:
037: /**
038: * Standard classifier, installed in every crawler by default.
039: * <P>On the entire page, this classifier sets the following labels:
040: * <UL>
041: * <LI><B>root</B>: page is the root page of a Web site. For instance,
042: * "http://www.digital.com/" and "http://www.digital.com/index.html" are both
043: * marked as root, but "http://www.digital.com/about" is not.
044: * </UL>
045: * <P>Also sets one or more of the following labels on every link:
046: * <UL>
047: * <LI><B>hyperlink</B>: link is a hyperlink (A, AREA, or FRAME tags) to another page on the Web (using http, file, ftp, or gopher protocols)
048: * <LI><B>image</B>: link is an inline image (IMG).
049: * <LI><B>form</B>: link is a form (FORM tag). A form generally requires some parameters to use.
050: * <LI><B>code</B>: link points to code (APPLET, EMBED, or SCRIPT).
051: * <LI><B>remote</B>: link points to a different Web server.
052: * <LI><B>local</B>: link points to the same Web server.
053: * <LI><B>same-page</B>: link points to the same page (e.g., by an anchor reference like "#top")
054: * <LI><B>sibling</B>: a local link that points to a page in the same directory (e.g. "sibling.html")
055: * <LI><B>descendent</B>: a local link that points downwards in the directory structure (e.g., "deep/deeper/deepest.html")
* <LI><B>ancestor</B>: a local link that points upwards in the directory structure (e.g., "../..")
057: * </UL>
058: */
059: public class StandardClassifier implements Classifier {
060:
061: /**
062: * Make a StandardClassifier.
063: */
064: public StandardClassifier() {
065: }
066:
067: /**
068: * Classify a page.
069: * @param page Page to classify
070: */
071: // FIX: use regular expressions throughout this method
072: public void classify(Page page) {
073: Link origin = page.getOrigin();
074: String pageHost = origin.getHost();
075: int pagePort = origin.getPort();
076: String pagePath = origin.getFile();
077: String pageFilename = origin.getFilename();
078:
079: URL base = page.getBase();
080: String baseHost = base.getHost();
081: int basePort = base.getPort();
082: String basePath = base.getFile();
083:
084: if (pageFilename.equals("")
085: || pageFilename.startsWith("index.htm"))
086: page.setLabel("root");
087:
088: // FIX: Link needs to resolve "foo/bar/.." and "foo/." to "foo" in order for this
089: // stuff to work properly
090: Link[] links = page.getLinks();
091: if (links != null) {
092: for (int i = 0; i < links.length; ++i) {
093: Link link = links[i];
094:
095: if ((link.getHost().equals(pageHost) && link.getPort() == pagePort)
096: || (link.getHost().equals(baseHost) && link
097: .getPort() == basePort)) {
098: link.setLabel("local");
099:
100: String linkPath = link.getFile();
101:
102: if (linkPath.equals(pagePath)
103: || linkPath.equals(basePath))
104: link.setLabel("same-page");
105: else if (link.getDirectory().equals(
106: origin.getDirectory()))
107: link.setLabel("sibling");
108: else if (descendsFrom(linkPath, pagePath)
109: || descendsFrom(linkPath, basePath))
110: link.setLabel("descendent");
111: else if (descendsFrom(pagePath, linkPath)
112: || descendsFrom(basePath, linkPath))
113: link.setLabel("ancestor");
114: // NIY: child, parent
115: } else
116: link.setLabel("remote");
117:
118: // Link tag kinds: resource, form, hyperlink
119: String tagName = link.getTagName();
120:
121: if (tagName == Tag.IMG)
122: link.setLabel("image");
123: else if (tagName == Tag.APPLET || tagName == Tag.EMBED
124: || tagName == Tag.SCRIPT)
125: link.setLabel("code");
126: else if (tagName == Tag.FORM)
127: link.setLabel("form");
128: else if (tagName == Tag.A || tagName == Tag.AREA
129: || tagName == Tag.FRAME) {
130: String protocol = link.getProtocol();
131:
132: if ((protocol.equals("http")
133: || protocol.equals("ftp")
134: || protocol.equals("file") || protocol
135: .equals("gopher"))
136: && link.getMethod() == Link.GET)
137: link.setLabel("hyperlink");
138: }
139: }
140: }
141: }
142:
143: private boolean descendsFrom(String path1, String path2) {
144: return path1.startsWith(path2.endsWith("/") ? path2 : path2
145: + "/");
146: }
147:
148: /**
149: * Priority of this classifier.
150: */
151: public static final float priority = 0.0F;
152:
153: /**
154: * Get priority of this classifier.
155: * @return priority.
156: */
157: public float getPriority() {
158: return priority;
159: }
160: }
|