001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.searchengine;
034:
035: import websphinx.*;
036: import java.net.URL;
037: import java.net.URLEncoder;
038: import java.net.MalformedURLException;
039:
040: /**
041: * <A href="http://www.newsindex.com/">NewsIndex</a> search engine.
042: */
043: public class NewsIndex implements SearchEngine {
044:
045: static Pattern patCount = new Regexp(
046: "<center>Headlines\\s+\\d+\\s+to\\s+\\d+\\s+of\\s+(\\d+)</center>");
047: static Pattern patNoHits = new Regexp(
048: "No articles were found matching your search criteria");
049:
050: static Pattern patResult = new Tagexp(
051: "<dd>(?{link}(?{title}<a>.*?</a>))" // title and link
052: + "<blockquote><b></b>" // news source
053: + "(?{description})</blockquote>" // description and index date
054: );
055:
056: /**
057: * Classify a page. Sets the following labels:
058: * <TABLE>
059: * <TR><TH>Name <TH>Type <TH>Meaning
060: * <TR><TD>searchengine.source <TD>Page label <TD>NewsIndex object that labeled the page
061: * <TR><TD>searchengine.count <TD>Page field <TD>Number of results on page
062: * <TR><TD>searchengine.results <TD>Page fields <TD>Array of results. Each result region
063: * contains subfields: title, description, and link.
064: * <TR><TD>searchengine.more-results <TD>Link label <TD>Link to a page containing more results.
065: * </TABLE>
066: */
067: public void classify(Page page) {
068: String title = page.getTitle();
069: if (title != null && title.equals("News Index - Results")) {
070: page.setObjectLabel("searchengine.source", this );
071:
072: Region count = patCount.oneMatch(page);
073: if (count != null)
074: page
075: .setField("searchengine.count", count
076: .getField("0"));
077:
078: Region[] results = patResult.allMatches(page);
079: SearchEngineResult[] ser = new SearchEngineResult[results.length];
080: for (int i = 0; i < results.length; ++i)
081: ser[i] = new SearchEngineResult(results[i]);
082: page.setFields("searchengine.results", ser);
083:
084: // find "more" link
085: Link[] links = page.getLinks();
086: for (int i = 0; i < links.length; ++i) {
087: if (links[i].toText().equals("Next 10 Headlines")) {
088: links[i].setLabel("searchengine.more-results");
089: links[i].setLabel("hyperlink");
090: break;
091: }
092: }
093: }
094: }
095:
096: /**
097: * Priority of this classifier.
098: */
099: public static final float priority = 0.0F;
100:
101: /**
102: * Get priority of this classifier.
103: * @return priority.
104: */
105: public float getPriority() {
106: return priority;
107: }
108:
109: /**
110: * Make a query URL for NewsIndex.
111: * @param keywords list of keywords, separated by spaces
112: * @return URL that submits the keywords to NewsIndex.
113: */
114: public URL makeQuery(String keywords) {
115: try {
116: return new URL(
117: "http://www.newsindex.com/cgi-bin/process.cgi?mode=any&query="
118: + URLEncoder.encode(keywords));
119: } catch (MalformedURLException e) {
120: throw new RuntimeException("internal error");
121: }
122: }
123:
124: /**
125: * Get number of results per page for this search engine.
126: * @return typical number of results per page
127: */
128: public int getResultsPerPage() {
129: return 10;
130: }
131:
132: /**
133: * Search NewsIndex.
134: * @param keywords list of keywords, separated by spaces
135: * @return enumeration of SearchEngineResults returned by an NewsIndex query constructed from the keywords.
136: */
137: public static Search search(String keywords) {
138: return new Search(new NewsIndex(), keywords);
139: }
140:
141: /**
142: * Search NewsIndex.
143: * @param keywords list of keywords, separated by spaces
144: * @param maxResults maximum number of results to return
145: * @return enumeration of SearchEngineResults returned by an NewsIndex query constructed from the keywords.
146: * The enumeration yields at most maxResults objects.
147: */
148: public static Search search(String keywords, int maxResults) {
149: return new Search(new NewsIndex(), keywords, maxResults);
150: }
151: }
|