001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.searchengine;
034:
035: import websphinx.*;
036: import java.util.Vector;
037: import java.util.Enumeration;
038: import java.util.NoSuchElementException;
039:
040: public class Search extends Crawler implements Enumeration {
041:
042: int maxResults;
043: int walkedResults; // approximate number of results walked to
044:
045: Vector results = new Vector(); // vector of SearchEngineResults
046: int nextResult = 0; // next result to be returned by the enumeration
047: int approxCount = -1; // (approximate) total number of results
048: boolean crawling = false;
049:
050: public Search() {
051: this (Integer.MAX_VALUE);
052: }
053:
054: public Search(int maxResults) {
055: this .maxResults = maxResults;
056: setDepthFirst(false);
057: setMaxDepth(Integer.MAX_VALUE);
058: EventLog.monitor(this ); // FIX: debugging only
059: }
060:
061: public Search(SearchEngine engine, String keywords, int maxResults) {
062: this (maxResults);
063: addQuery(engine, keywords);
064: search();
065: }
066:
067: public Search(SearchEngine engine, String keywords) {
068: this (engine, keywords, Integer.MAX_VALUE);
069: }
070:
071: public void addQuery(SearchEngine engine, String keywords) {
072: addRoot(new Link(engine.makeQuery(keywords)));
073: addClassifier(engine);
074: walkedResults += engine.getResultsPerPage();
075: }
076:
077: public void search() {
078: crawling = true;
079: Thread thread = new Thread(this , "Search");
080: thread.setDaemon(true);
081: thread.start();
082: }
083:
084: public int count() {
085: synchronized (results) {
086: // block until count is ready
087: try {
088: while (approxCount == -1 && crawling)
089: results.wait();
090: } catch (InterruptedException e) {
091: }
092: return approxCount;
093: }
094: }
095:
096: public boolean hasMoreElements() {
097: synchronized (results) {
098: try {
099: while (nextResult >= results.size() && crawling)
100: results.wait();
101: } catch (InterruptedException e) {
102: }
103:
104: return nextResult < results.size();
105: }
106: }
107:
108: public Object nextElement() {
109: return nextResult();
110: }
111:
112: public SearchEngineResult nextResult() {
113: if (!hasMoreElements())
114: throw new NoSuchElementException();
115: synchronized (results) {
116: SearchEngineResult result = (SearchEngineResult) results
117: .elementAt(nextResult++);
118: if (result.rank == 0)
119: result.rank = nextResult;
120: return result;
121: }
122: }
123:
124: public void run() {
125: super .run();
126: synchronized (results) {
127: if (approxCount == -1)
128: approxCount = 0;
129: crawling = false;
130: results.notify();
131: }
132: }
133:
134: public void visit(Page page) {
135: synchronized (results) {
136: if (approxCount == -1)
137: approxCount = page.getNumericLabel(
138: "searchengine.count", new Integer(0))
139: .intValue();
140:
141: Region[] ser = page.getFields("searchengine.results");
142: for (int i = 0; i < ser.length; ++i) {
143: if (results.size() == maxResults) {
144: stop();
145: return;
146: }
147: results.addElement(ser[i]);
148: }
149: results.notify();
150: }
151: }
152:
153: public boolean shouldVisit(Link link) {
154: if (walkedResults >= maxResults
155: || !link.hasLabel("searchengine.more-results"))
156: return false;
157: SearchEngine engine = (SearchEngine) link.getSource()
158: .getObjectLabel("searchengine.source");
159: walkedResults += engine.getResultsPerPage();
160: return true;
161: }
162:
163: public static void main(String[] args) throws Exception {
164: if (args.length == 0) {
165: System.err
166: .println("Search <search engine classname> [-max n] <keywords>*");
167: return;
168: }
169:
170: SearchEngine engine = (SearchEngine) Class.forName(args[0])
171: .newInstance();
172:
173: int max = Integer.MAX_VALUE;
174: int firstKeyword = 1;
175: if (args[1].equals("-max")) {
176: max = Integer.parseInt(args[2]);
177: firstKeyword = 3;
178: }
179:
180: Search ms = new Search(max);
181: ms.addQuery(engine, concat(args, firstKeyword));
182: ms.search();
183: while (ms.hasMoreElements())
184: System.out.println(ms.nextResult());
185: }
186:
187: static String concat(String[] args, int start) {
188: StringBuffer buf = new StringBuffer();
189: for (int i = start; i < args.length; ++i) {
190: if (buf.length() > 0)
191: buf.append(' ');
192: buf.append(args[i]);
193: }
194: return buf.toString();
195: }
196:
197: }
|