001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.searchengine; // ALB 12/97
034:
035: import websphinx.*;
036: import java.net.URL;
037: import java.net.URLEncoder;
038: import java.net.MalformedURLException;
039:
040: /**
041: * <A href="http://www.excite.com/">Excite</a> search engine.
042: * @author Adam Berger
043: */
044: public class Excite implements SearchEngine {
045:
046: static Pattern patCount = new Regexp(
047: "<font size=-1 face=\"arial, helvetica\">(?:About )?<b>(\\d+)</b> documents? match your query.");
048: static Pattern patNoHits = new Regexp(
049: "No documents match the query.");
050:
051: static Pattern patResult = new Tagexp(
052: "<FONT COLOR=navy><B>(?{score})</B></FONT>" // score
053: + "<B>(?{link}<a>(?{title})</a>)</B>" // title and main link
054: + "<BR><B><I></I></B>" // URL
055: + "<BR><B><I></I></B>(?{description})" // Summary
056: + "<BR><B><I></I></B><a><p></a>" // More Like This
057: );
058:
059: static Pattern patMoreLink = new Tagexp(
060: "<INPUT TYPE=submit NAME=next VALUE=\"Next Results\">");
061:
062: /**
063: * Classify a page. Sets the following labels:
064: * <TABLE>
065: * <TR><TH>Name <TH>Type <TH>Meaning
066: * <TR><TD>searchengine.source <TD>Page label <TD>Excite object that labeled the page
067: * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
068: * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
069: * contains subfields: score, title, description, and link.
070: * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
071: * </TABLE>
072: */
073: public void classify(Page page) {
074: String title = page.getTitle();
075: if (title != null
076: && (title.startsWith("Excite Search Results"))) {
077: page.setObjectLabel("searchengine.source", this );
078:
079: Region count = patCount.oneMatch(page);
080: if (count != null)
081: page
082: .setField("searchengine.count", count
083: .getField("0"));
084:
085: Region[] results = patResult.allMatches(page);
086: SearchEngineResult[] ser = new SearchEngineResult[results.length];
087: for (int i = 0; i < results.length; ++i)
088: ser[i] = new SearchEngineResult(results[i]);
089: page.setFields("searchengine.results", ser);
090:
091: PatternMatcher m = patMoreLink.match(page);
092: while (m.hasMoreElements()) {
093: Link link = (Link) m.nextMatch();
094: link.setLabel("searchengine.more-results");
095: link.setLabel("hyperlink");
096: }
097: }
098: }
099:
100: /**
101: * Priority of this classifier.
102: */
103: public static final float priority = 0.0F;
104:
105: /**
106: * Get priority of this classifier.
107: * @return priority.
108: */
109: public float getPriority() {
110: return priority;
111: }
112:
113: /**
114: * Make a query URL for Excite.
115: * @param keywords list of keywords, separated by spaces
116: * @return URL that submits the keywords to Excite.
117: */
118: public URL makeQuery(String keywords) {
119: try {
120: return new URL(
121: "http://www.excite.com/search.gw?trace=a&search="
122: + URLEncoder.encode(keywords));
123: } catch (MalformedURLException e) {
124: throw new RuntimeException("internal error");
125: }
126: }
127:
128: /**
129: * Get number of results per page for this search engine.
130: * @return typical number of results per page
131: */
132: public int getResultsPerPage() {
133: return 10;
134: }
135:
136: /**
137: * Search Excite.
138: * @param keywords list of keywords, separated by spaces
139: * @return enumeration of SearchEngineResults returned by an Excite query constructed from the keywords.
140: */
141: public static Search search(String keywords) {
142: return new Search(new Excite(), keywords);
143: }
144:
145: /**
146: * Search Excite.
147: * @param keywords list of keywords, separated by spaces
148: * @param maxResults maximum number of results to return
149: * @return enumeration of SearchEngineResults returned by an Excite query constructed from the keywords.
150: * The enumeration yields at most maxResults objects.
151: */
152: public static Search search(String keywords, int maxResults) {
153: return new Search(new Excite(), keywords, maxResults);
154: }
155: }
|