001: /*
002: * WebSphinx web-crawling toolkit
003: *
004: * Copyright (c) 1998-2002 Carnegie Mellon University. All rights
005: * reserved.
006: *
007: * Redistribution and use in source and binary forms, with or without
008: * modification, are permitted provided that the following conditions
009: * are met:
010: *
011: * 1. Redistributions of source code must retain the above copyright
012: * notice, this list of conditions and the following disclaimer.
013: *
014: * 2. Redistributions in binary form must reproduce the above copyright
015: * notice, this list of conditions and the following disclaimer in
016: * the documentation and/or other materials provided with the
017: * distribution.
018: *
019: * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020: * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021: * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023: * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024: * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025: * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026: * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027: * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029: * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030: *
031: */
032:
033: package websphinx.workbench;
034:
035: import websphinx.*;
036: import java.io.File;
037: import java.io.IOException;
038:
039: public class ExtractAction implements Action, CrawlListener {
040: Pattern pattern;
041: String filename;
042: boolean useBrowser;
043: boolean textOnly;
044:
045: transient File file;
046: transient RecordTransformer records;
047: transient boolean noFields;
048:
049: public ExtractAction(Pattern pattern, boolean useBrowser,
050: String filename, boolean textOnly) {
051: this .pattern = pattern;
052: this .filename = filename;
053: this .useBrowser = useBrowser;
054: this .textOnly = textOnly;
055: }
056:
057: public boolean equals(Object object) {
058: if (!(object instanceof ExtractAction))
059: return false;
060: ExtractAction a = (ExtractAction) object;
061: return same(a.filename, filename) && a.useBrowser == useBrowser
062: && a.pattern.equals(pattern) && a.textOnly == textOnly;
063: }
064:
065: private boolean same(String s1, String s2) {
066: if (s1 == null || s2 == null)
067: return s1 == s2;
068: else
069: return s1.equals(s2);
070: }
071:
072: public Pattern getPattern() {
073: return pattern;
074: }
075:
076: public boolean getUseBrowser() {
077: return useBrowser;
078: }
079:
080: public String getFilename() {
081: return filename;
082: }
083:
084: public boolean getTextOnly() {
085: return textOnly;
086: }
087:
088: public void connected(Crawler crawler) {
089: crawler.addCrawlListener(this );
090: }
091:
092: public void disconnected(Crawler crawler) {
093: crawler.removeCrawlListener(this );
094: }
095:
096: private void showit() {
097: Browser browser = Context.getBrowser();
098: if (browser != null)
099: browser.show(file);
100: }
101:
102: public synchronized void visit(Page page) {
103: try {
104: int n = 0;
105:
106: PatternMatcher m = pattern.match(page);
107: for (Region r = m.nextMatch(); r != null; r = m.nextMatch()) {
108: Object[] fields;
109: if (noFields) {
110: fields = new Object[1];
111: fields[0] = r;
112: } else
113: fields = (Object[]) r.getFields(Pattern.groups);
114:
115: records.writeRecord(fields, textOnly);
116: ++n;
117: }
118:
119: if (n > 0)
120: records.flush();
121: } catch (IOException e) {
122: throw new RuntimeException(e.toString());
123: }
124: }
125:
126: /**
127: * Notify that the crawler started.
128: */
129: public synchronized void started(CrawlEvent event) {
130: if (records == null) {
131: try {
132: file = (filename != null) ? new File(filename) : Access
133: .getAccess().makeTemporaryFile("extract",
134: ".html");
135:
136: records = new RecordTransformer(file.toString());
137:
138: String[] fieldNames = pattern.getFieldNames();
139: noFields = (fieldNames.length == 0);
140: records.setProlog(records.getProlog()
141: + makeTableHeader(fieldNames));
142: } catch (IOException e) {
143: System.err.println(e); // FIX: use GUI when available
144: }
145: }
146: }
147:
148: private String makeTableHeader(String[] fieldNames) {
149: String result = "<TR>\n<TH>\n";
150: if (fieldNames.length == 0)
151: result += "<TH>\n";
152: else
153: for (int i = 0; i < fieldNames.length; ++i)
154: result += "<TH>" + fieldNames[i] + "\n";
155: return result;
156: }
157:
158: /**
159: * Notify that the crawler ran out of links to crawl
160: */
161: public synchronized void stopped(CrawlEvent event) {
162: try {
163: if (records != null) {
164: records.close();
165: records = null;
166: if (useBrowser)
167: showit();
168: }
169: } catch (IOException e) {
170: System.err.println(e); // FIX: use GUI when available
171: }
172: }
173:
174: /**
175: * Notify that the crawler's state was cleared.
176: */
177: public synchronized void cleared(CrawlEvent event) {
178: try {
179: if (records != null) {
180: records.close();
181: records = null;
182: if (useBrowser)
183: showit();
184: }
185: } catch (IOException e) {
186: System.err.println(e); // FIX: use GUI when available
187: }
188: }
189:
190: /**
191: * Notify that the crawler timed out.
192: */
193: public synchronized void timedOut(CrawlEvent event) {
194: try {
195: records.close();
196: records = null;
197: if (useBrowser)
198: showit();
199: } catch (IOException e) {
200: System.err.println(e); // FIX: use GUI when available
201: }
202: }
203:
204: /**
205: * Notify that the crawler is paused.
206: */
207: public synchronized void paused(CrawlEvent event) {
208: try {
209: records.flush();
210: if (useBrowser)
211: showit();
212: } catch (IOException e) {
213: System.err.println(e); // FIX: use GUI when available
214: }
215: }
216: }
|