001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.searchengine;
034:
035: import websphinx.*;
036: import java.net.URL;
037: import java.net.URLEncoder;
038: import java.net.MalformedURLException;
039:
040: /**
041: * <A href="http://altavista.digital.com/">AltaVista</a> search engine.
042: */
043: public class AltaVista implements SearchEngine {
044:
045: static Pattern patCount = new Regexp(
046: "<font size=-1 face=\"arial, helvetica\">(?:About )?<b>(\\d+)</b> documents? match your query.");
047: static Pattern patNoHits = new Regexp(
048: "No documents match the query.");
049:
050: static Pattern patResult = new Tagexp("<dt><b>(?{rank})</b>" // rank
051: + "(?{link}<a><b>(?{title})</b></a>)" // title and main link
052: + "<dd>(?{description})<br>" // description
053: + "(?:<i>(?:<a></a>)?</i><br>)+" // URL(s)
054: + "<p>" // terminator
055: );
056:
057: static Pattern patMoreLink = new Tagexp(
058: "<input type=image name=navig* value=nav.gif>");
059:
060: /**
061: * Classify a page. Sets the following labels:
062: * <TABLE>
063: * <TR><TH>Name <TH>Type <TH>Meaning
064: * <TR><TD>searchengine.source <TD>Page label <TD>AltaVista object that labeled the page
065: * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
066: * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
067: * contains subfields: rank, title, description, and link.
068: * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
069: * </TABLE>
070: */
071: public void classify(Page page) {
072: String title = page.getTitle();
073: if (title != null
074: && (title.startsWith("AltaVista: Simple Query") || title
075: .startsWith("AltaVista: Advanced Query"))) {
076: page.setObjectLabel("searchengine.source", this );
077:
078: Region count = patCount.oneMatch(page);
079: if (count != null)
080: page
081: .setField("searchengine.count", count
082: .getField("0"));
083:
084: Region[] results = patResult.allMatches(page);
085: SearchEngineResult[] ser = new SearchEngineResult[results.length];
086: for (int i = 0; i < results.length; ++i)
087: ser[i] = new SearchEngineResult(results[i]);
088: page.setFields("searchengine.results", ser);
089:
090: PatternMatcher m = patMoreLink.match(page);
091: while (m.hasMoreElements()) {
092: Link link = (Link) m.nextMatch();
093: link.setLabel("searchengine.more-results");
094: link.setLabel("hyperlink");
095: }
096: }
097: }
098:
099: /**
100: * Priority of this classifier.
101: */
102: public static final float priority = 0.0F;
103:
104: /**
105: * Get priority of this classifier.
106: * @return priority.
107: */
108: public float getPriority() {
109: return priority;
110: }
111:
112: /**
113: * Make a query URL for AltaVista.
114: * @param keywords list of keywords, separated by spaces
115: * @return URL that submits the keywords to AltaVista.
116: */
117: public URL makeQuery(String keywords) {
118: try {
119: return new URL(
120: "http://altavista.digital.com/cgi-bin/query?pg=q&what=web&kl=XX&q="
121: + URLEncoder.encode(keywords));
122: } catch (MalformedURLException e) {
123: throw new RuntimeException("internal error");
124: }
125: }
126:
127: /**
128: * Get number of results per page for this search engine.
129: * @return typical number of results per page
130: */
131: public int getResultsPerPage() {
132: return 10;
133: }
134:
135: /**
136: * Search AltaVista.
137: * @param keywords list of keywords, separated by spaces
138: * @return enumeration of SearchEngineResults returned by an AltaVista query constructed from the keywords.
139: */
140: public static Search search(String keywords) {
141: return new Search(new AltaVista(), keywords);
142: }
143:
144: /**
145: * Search AltaVista.
146: * @param keywords list of keywords, separated by spaces
147: * @param maxResults maximum number of results to return
148: * @return enumeration of SearchEngineResults returned by an AltaVista query constructed from the keywords.
149: * The enumeration yields at most maxResults objects.
150: */
151: public static Search search(String keywords, int maxResults) {
152: return new Search(new AltaVista(), keywords, maxResults);
153: }
154: }
|