001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.searchengine;
034:
035: import websphinx.*;
036: import java.net.URL;
037: import java.net.URLEncoder;
038: import java.net.MalformedURLException;
039:
040: /**
041: * <A href="http://www.hotbot.com/">HotBot</a> search engine.
042: */
043: public class HotBot implements SearchEngine {
044:
045: static Pattern patTitle = new Regexp("^");
046:
047: static Pattern patCount = new Regexp(
048: "Returned <B>(\\d+)</b> matches");
049: static Pattern patNoHits = new Regexp(
050: "Sorry -- your search yielded no results");
051:
052: // FIX: works only for Netscape
053: static Pattern patResult = new Tagexp(
054: "<td><b>(?{rank})</b></td>" // rank
055: + "<td>(?:<a><img></a>)?" // optional icon
056: + "(?{link}<a>(?{title})</a>)</td>" // title and main link
057: + "</tr><tr><td><font>(?{score})</font></td><td>(?{description})<br></td>" // description
058: );
059:
060: static Pattern patMoreLink = new Tagexp(
061: "<input type=image name=act.next>");
062:
063: /**
064: * Classify a page. Sets the following labels:
065: * <TABLE>
066: * <TR><TH>Name <TH>Type <TH>Meaning
067: * <TR><TD>searchengine.source <TD>Page label <TD>HotBot object that labeled this page
068: * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
069: * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
070: * contains subfields: rank, title, description, and link.
071: * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
072: * </TABLE>
073: */
074: public void classify(Page page) {
075: String title = page.getTitle();
076: if (title != null && title.startsWith("HotBot results:")) {
077: page.setObjectLabel("searchengine.source", this );
078:
079: Region count = patCount.oneMatch(page);
080: if (count != null)
081: page
082: .setField("searchengine.count", count
083: .getField("0"));
084:
085: Region[] results = patResult.allMatches(page);
086: //System.err.println ("found " + results.length + " results");
087: SearchEngineResult[] ser = new SearchEngineResult[results.length];
088: for (int i = 0; i < results.length; ++i) {
089: ser[i] = new SearchEngineResult(results[i]);
090: //System.out.println (ser[i]);
091: }
092: page.setFields("searchengine.results", ser);
093:
094: PatternMatcher m = patMoreLink.match(page);
095: while (m.hasMoreElements()) {
096: Link link = (Link) m.nextMatch();
097: link.setLabel("searchengine.more-results");
098: link.setLabel("hyperlink");
099: }
100: } else
101: System.err.println("not a HotBot page");
102:
103: }
104:
105: /**
106: * Priority of this classifier.
107: */
108: public static final float priority = 0.0F;
109:
110: /**
111: * Get priority of this classifier.
112: * @return priority.
113: */
114: public float getPriority() {
115: return priority;
116: }
117:
118: /**
119: * Make a query URL for HotBot.
120: * @param keywords list of keywords, separated by spaces
121: * @return URL that submits the keywords to HotBot.
122: */
123: public URL makeQuery(String keywords) {
124: try {
125: return new URL(
126: "http://www.search.hotbot.com/hResult.html/?SM=MC&MT="
127: + URLEncoder.encode(keywords)
128: + "&DV=7&RG=.com&DC=10&DE=2&OPs=MDRTP&_v=2&DU=days&SW=web");
129: } catch (MalformedURLException e) {
130: throw new RuntimeException("internal error");
131: }
132: }
133:
134: /**
135: * Get number of results per page for this search engine.
136: * @return typical number of results per page
137: */
138: public int getResultsPerPage() {
139: return 10;
140: }
141:
142: /**
143: * Search HotBot.
144: * @param keywords list of keywords, separated by spaces
145: * @return enumeration of SearchEngineResults returned by a HotBot query constructed from the keywords.
146: */
147: public static Search search(String keywords) {
148: return new Search(new HotBot(), keywords);
149: }
150:
151: /**
152: * Search HotBot.
153: * @param keywords list of keywords, separated by spaces
154: * @param maxResults maximum number of results to return
155: * @return enumeration of SearchEngineResults returned by an HotBot query constructed from the keywords.
156: * The enumeration yields at most maxResults objects.
157: */
158: public static Search search(String keywords, int maxResults) {
159: return new Search(new HotBot(), keywords, maxResults);
160: }
161: }
|