001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.searchengine;
034:
035: import websphinx.*;
036: import java.net.URL;
037: import java.net.URLEncoder;
038: import java.net.MalformedURLException;
039:
040: /**
041: * <A href="http://www.newbot.com/">NewsBot</a> search engine.
042: */
043: public class NewsBot implements SearchEngine {
044:
045: static Pattern patTitle = new Regexp("^");
046:
047: static Pattern patCount = new Regexp(
048: "Returned <B>(\\d+)</b> results");
049: static Pattern patNoHits = new Regexp(
050: "Sorry -- your search yielded no results");
051:
052: // FIX: works only for Netscape
053: static Pattern patResult = new Tagexp("<font>"
054: + "(?{link}<A>(?{title})</A>)" + "</font>" + "<br>"
055: + "<font></font>(?{description})<br>"
056: + "<font><b></b></font><p>");
057:
058: static Pattern patMoreLink = new Tagexp(
059: "<input type=image name=act.next>");
060:
061: /**
062: * Classify a page. Sets the following labels:
063: * <TABLE>
064: * <TR><TH>Name <TH>Type <TH>Meaning
065: * <TR><TD>searchengine.source <TD>Page label <TD>NewsBot object that labeled this page
066: * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
067: * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
068: * contains subfields: rank, title, description, and link.
069: * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
070: * </TABLE>
071: */
072: public void classify(Page page) {
073: String title = page.getTitle();
074: if (title != null && title.startsWith("HotBot results:")) {
075: page.setObjectLabel("searchengine.source", this );
076:
077: Region count = patCount.oneMatch(page);
078: if (count != null)
079: page
080: .setField("searchengine.count", count
081: .getField("0"));
082:
083: Region[] results = patResult.allMatches(page);
084: SearchEngineResult[] ser = new SearchEngineResult[results.length];
085: for (int i = 0; i < results.length; ++i) {
086: ser[i] = new SearchEngineResult(results[i]);
087: //System.out.println (ser[i]);
088: }
089: page.setFields("searchengine.results", ser);
090:
091: PatternMatcher m = patMoreLink.match(page);
092: while (m.hasMoreElements()) {
093: Link link = (Link) m.nextMatch();
094: link.setLabel("searchengine.more-results");
095: link.setLabel("hyperlink");
096: }
097: } else
098: System.err.println("not a NewsBot page");
099:
100: }
101:
102: /**
103: * Priority of this classifier.
104: */
105: public static final float priority = 0.0F;
106:
107: /**
108: * Get priority of this classifier.
109: * @return priority.
110: */
111: public float getPriority() {
112: return priority;
113: }
114:
115: /**
116: * Make a query URL for NewsBot.
117: * @param keywords list of keywords, separated by spaces
118: * @return URL that submits the keywords to NewsBot.
119: */
120: public URL makeQuery(String keywords) {
121: try {
122: java.util.StringTokenizer tok = new java.util.StringTokenizer(
123: keywords);
124: StringBuffer output = new StringBuffer();
125: while (tok.hasMoreElements()) {
126: String kw = tok.nextToken();
127: if (output.length() > 0)
128: output.append(" or ");
129: output.append(kw);
130: }
131:
132: return new URL(
133: "http://engine.newbot.com/newbot/server/query.fpl?client_id=0sQaJNoAahXc&output=hotbot4&logad=1&client_sw=html&client_vr=0.9&client_last_updated=ignore&T0=hotbot&S0=date&P0=&F0=24&Q0="
134: + URLEncoder.encode(output.toString())
135: + "&max_results=50&S0=rank&Search.x=55&Search.y=4");
136: } catch (MalformedURLException e) {
137: throw new RuntimeException("internal error");
138: }
139: }
140:
141: /**
142: * Get number of results per page for this search engine.
143: * @return typical number of results per page
144: */
145: public int getResultsPerPage() {
146: return 10;
147: }
148:
149: /**
150: * Search NewsBot.
151: * @param keywords list of keywords, separated by spaces
152: * @return enumeration of SearchEngineResults returned by a NewsBot query constructed from the keywords.
153: */
154: public static Search search(String keywords) {
155: return new Search(new NewsBot(), keywords);
156: }
157:
158: /**
159: * Search NewsBot.
160: * @param keywords list of keywords, separated by spaces
161: * @param maxResults maximum number of results to return
162: * @return enumeration of SearchEngineResults returned by an NewsBot query constructed from the keywords.
163: * The enumeration yields at most maxResults objects.
164: */
165: public static Search search(String keywords, int maxResults) {
166: return new Search(new NewsBot(), keywords, maxResults);
167: }
168: }
|