001: package bplatt.spider;
002:
003: /** Arachnid - Abstract Web spider class
004: * To use, derive class from Arachnid,
005: * Add handleLink(), handleBadLink(), handleNonHTMLlink(),
006: * handleExternalLink(), and handleBadIO() methods
007: * Instantiate and call traverse()
008: *
009: * Copyright 2002, Robert L. Platt, All rights reserved
010: * @author Robert L. Platt
011: *
012: * This program is free software; you can redistribute it and/or modify
013: * it under the terms of the GNU General Public License as published by
014: * the Free Software Foundation; either version 2 of the License, or
015: * (at your option) any later version.
016: *
017: * This program is distributed in the hope that it will be useful,
018: * but WITHOUT ANY WARRANTY; without even the implied warranty of
019: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: * GNU General Public License for more details.
021: *
022: * You should have received a copy of the GNU General Public License
023: * along with this program; if not, write to the Free Software
024: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: */
026:
027: import java.io.*;
028: import java.net.*;
029: import java.util.*;
030:
031: public abstract class Arachnid {
032: private String base;
033: private URL baseUrl;
034: private HashSet visited;
035: private int delay;
036: private static final String HTML = "text/html";
037:
038: /** Constructor */
039: public Arachnid(String base) throws MalformedURLException {
040: this .base = base;
041: baseUrl = new URL(base);
042: visited = new HashSet();
043: delay = 2;
044: }
045:
046: /** Traverse Web site */
047: public void traverse() {
048: traverse(baseUrl, null);
049: }
050:
051: private void traverse(URL url, URL parent) {
052: boolean isHTMLfile = true;
053: PageInfo p = null;
054: try {
055: p = getWebPage(url, parent);
056: } catch (IOException e) {
057: handleBadIO(url, parent);
058: sleep(delay);
059: return;
060: }
061: if (p == null) {
062: handleBadLink(url, parent, null);
063: sleep(delay);
064: return;
065: }
066: if (p.isValid() == false) {
067: if (p.getContentType().equalsIgnoreCase(HTML) == false)
068: handleNonHTMLlink(url, parent, p);
069: else
070: handleBadLink(url, parent, p);
071: sleep(delay);
072: return;
073: } else
074: handleLink(p);
075:
076: // Navigate through links on page
077: URL[] links = p.getLinks();
078: if (links == null) {
079: sleep(delay);
080: return;
081: }
082: int n = links.length;
083: for (int i = 0; i < n; ++i) {
084: if (isOKtoVisit(links[i])) {
085: visited.add(links[i]);
086: traverse(links[i], url);
087: } else if (isExternalSite(links[i]))
088: handleExternalLink(links[i], url);
089: }
090: sleep(delay);
091: return;
092: }
093:
094: /** (Abstract) Handle bad URL */
095: protected abstract void handleBadLink(URL url, URL parent,
096: PageInfo p);
097:
098: /** (Abstract) Handle a link; a Web page in the site */
099: protected abstract void handleLink(PageInfo p);
100:
101: /** (Abstract) Handle a non-HTML link */
102: protected abstract void handleNonHTMLlink(URL url, URL parent,
103: PageInfo p);
104:
105: /** (Abstract) Handle an external (outside of Web site) link */
106: protected abstract void handleExternalLink(URL url, URL parent);
107:
108: /** (Abstract) Handle an I/O Exception (server problem) */
109: protected abstract void handleBadIO(URL url, URL parent);
110:
111: /** Return true if it's OK to visit the link,
112: false if it's not */
113: private boolean isOKtoVisit(URL link) {
114: // Return false if it's not HTTP protocol
115: if (!link.getProtocol().equals("http"))
116: return (false);
117: // Return false if it's an external site
118: else if (isExternalSite(link))
119: return (false);
120: else if (visited.contains(link))
121: return (false);
122: else
123: return (true);
124: }
125:
126: private boolean isExternalSite(URL link) {
127: // Return true if link host is different from base or
128: // if path of link is not a superset of base URL
129: if (link.getAuthority() != baseUrl.getAuthority()
130: || (!UrlPathDir(link).startsWith(UrlPathDir(baseUrl))))
131: return (true);
132: else
133: return (false);
134: }
135:
136: private String UrlPathDir(URL u) {
137: String p = u.getPath();
138: if (p == null || p.equals(""))
139: return ("/");
140: int i = p.lastIndexOf("/");
141: if (i == -1)
142: return ("/");
143: else
144: p = p.substring(0, i + 1);
145: return (p);
146: }
147:
148: // Populate a PageInfo object from a URL
149: private PageInfo getWebPage(URL url, URL parentUrl)
150: throws IOException {
151: HttpURLConnection connection = (HttpURLConnection) url
152: .openConnection();
153: int responseCode = connection.getResponseCode();
154: String contentType = connection.getContentType();
155: // Note: contentLength == -1 if NOT KNOWN (i.e. not returned from server)
156: int contentLength = connection.getContentLength();
157: PageInfo p = new PageInfo(url, parentUrl, contentType,
158: contentLength, responseCode);
159: InputStreamReader rdr = new InputStreamReader(connection
160: .getInputStream());
161: p.extract(rdr);
162: rdr.close();
163: connection.disconnect();
164: return (p);
165: }
166:
167: /** Get contents of a URL */
168: public byte[] getContent(URL url) {
169: byte[] buf = null;
170: try {
171: HttpURLConnection connection = (HttpURLConnection) url
172: .openConnection();
173: int responseCode = connection.getResponseCode();
174: int contentLength = connection.getContentLength();
175: // System.out.println("Content length: "+contentLength);
176: if (responseCode != HttpURLConnection.HTTP_OK
177: || contentLength <= 0)
178: return (null);
179: InputStream in = connection.getInputStream();
180: BufferedInputStream bufIn = new BufferedInputStream(in);
181: buf = new byte[contentLength];
182: // Added code to handle blocked reads
183: int bytesToRead = contentLength;
184: int flag = 10;
185: while (bytesToRead != 0 && flag != 0) {
186: int bytesRead = bufIn.read(buf,
187: (contentLength - bytesToRead), bytesToRead);
188: bytesToRead = bytesToRead - bytesRead;
189: flag--;
190: if (flag <= 5)
191: sleep(1);
192: }
193: in.close();
194: connection.disconnect();
195: if (flag == 0)
196: return (null);
197: } catch (Exception e) {
198: // System.out.println(e);
199: // e.printStackTrace();
200: return (null);
201: }
202:
203: return (buf);
204: }
205:
206: /** Return base URL (starting point for Web traversal) */
207: public URL getBaseUrl() {
208: return (baseUrl);
209: }
210:
211: // Sleep N seconds
212: private void sleep(int n) {
213: if (n <= 0)
214: return;
215: Thread mythread = Thread.currentThread();
216: try {
217: mythread.sleep(n * 1000);
218: } catch (InterruptedException e) { // Ignore
219: }
220: }
221:
222: /**
223: * Returns delay (N second pause after processing EACH web page)
224: * @return int
225: */
226: public int getDelay() {
227: return delay;
228: }
229:
230: /**
231: * Sets delay (N second pause after processing EACH web page)
232: * @param delay The delay to set
233: */
234: public void setDelay(int delay) {
235: this.delay = delay;
236: }
237:
238: }
|