001: /*
002: * This program is free software; you can redistribute it and/or
003: * modify it under the terms of the GNU General Public License
004: * as published by the Free Software Foundation; either version 2
005: * of the License, or (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011:
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
015: */
016: package net.sf.jftp.tools;
017:
018: import java.io.BufferedInputStream;
019: import java.io.BufferedOutputStream;
020: import java.io.BufferedReader;
021: import java.io.BufferedWriter;
022: import java.io.DataInputStream;
023: import java.io.File;
024: import java.io.FileOutputStream;
025: import java.io.InputStreamReader;
026: import java.io.OutputStreamWriter;
027: import java.net.Socket;
028: import java.util.Enumeration;
029: import java.util.Hashtable;
030: import java.util.StringTokenizer;
031: import java.util.Vector;
032:
033: import net.sf.jftp.system.LocalIO;
034: import net.sf.jftp.system.logging.Log;
035:
036: public class FileSearch {
037:
038: private int currentDepth = 0;
039: private Hashtable checked = new Hashtable();
040: public static boolean quiet = true;
041: public static boolean ultraquiet = false;
042:
043: String localDir = ".";
044: int MAX = 999999;
045: int MIN_TERM = 1;
046: int MIN_FACTOR = 1;
047: boolean LOAD = false;
048: String[] typeArray = { "" };
049: String[] termArray = { "" };
050: String[] optArray = { "" };
051: String[] ignoreArray = { "" };
052: String[] scanArray = { "" };
053:
054: public static void main(String argv[]) {
055: String[] typeArray = { ".gz", ".bz2", ".zip", ".rar" };
056: String[] termArray = { "linux", "kernel" };
057: String[] optArray = { "download", "file", "mirror", "location" };
058: String[] ignoreArray = { ".gif", ".jpg", ".png", ".swf",
059: ".jar", ".class", ".google." };
060: String[] scanArray = { ".html", ".htm", "/", ".jsp", ".jhtml",
061: ".phtml", ".asp", ".xml", ".js", ".cgi" };
062: String url = "http://www.google.de/search?hl=de&q=";
063:
064: for (int i = 0; i < termArray.length; i++) {
065: url += termArray[i] + "+";
066: }
067:
068: FileSearch search = new FileSearch();
069:
070: search.typeArray = typeArray;
071: search.termArray = termArray;
072: search.optArray = optArray;
073: search.ignoreArray = ignoreArray;
074: search.scanArray = scanArray;
075: search.MIN_TERM = 1;
076:
077: search.spider(url);
078:
079: }
080:
081: private void spider(String url) {
082: try {
083: if (url.indexOf("/") < 0) {
084: url = url + "/";
085: }
086:
087: url = clear(url);
088:
089: Log.out(">>> URL: " + url);
090: Log.out(">>> Scanning for ");
091:
092: for (int i = 0; i < typeArray.length; i++) {
093: Log.out(typeArray[i] + " ");
094: }
095:
096: Log.out("");
097:
098: Log.out("Fetching initial HTML file...");
099:
100: Getter urlGetter = new Getter(localDir);
101: urlGetter.fetch(url, true);
102:
103: Log.out("Searching for links...");
104: LocalIO.pause(500);
105:
106: crawl(url);
107: } catch (Exception ex) {
108: ex.printStackTrace();
109: }
110: }
111:
112: private String clear(String url) {
113: int idx = url.indexOf("http://");
114:
115: if (idx >= 0) {
116: url = url.substring(7);
117: }
118:
119: return url;
120: }
121:
122: private Vector addVector(Vector v, Vector x) {
123: Enumeration e = x.elements();
124:
125: while (e.hasMoreElements()) {
126: String next = (String) e.nextElement();
127: v.add(next);
128: }
129:
130: return v;
131: }
132:
133: private int rate(String content) {
134: int score = 0;
135:
136: for (int i = 0; i < termArray.length; i++) {
137: if (content.indexOf(termArray[i]) >= 0)
138: score += 3;
139: }
140:
141: if (score < MIN_TERM)
142: return 0;
143:
144: for (int i = 0; i < optArray.length; i++) {
145: if (content.indexOf(optArray[i]) >= 0)
146: score++;
147: }
148:
149: return score;
150: }
151:
152: private int checkForResult(String url) {
153: //for(int i=0; i<typeArray.length; i++) {
154: // if(url.indexOf(typeArray[i]) >= 0) return 2;
155: //}
156:
157: for (int i = 0; i < ignoreArray.length; i++) {
158: if (url.indexOf(ignoreArray[i]) >= 0)
159: return -1;
160: }
161:
162: if (!checkForScanableUrl(url))
163: return -1;
164:
165: return 1;
166: }
167:
168: private boolean checkForScanableUrl(String url) {
169:
170: if (checked.containsKey(url)) {
171: return false;
172: } else {
173: checked.put(url, "");
174: }
175:
176: if (url.indexOf("/") > 0) {
177: String tmp = url.substring(0, url.indexOf("/"));
178: }
179:
180: for (int i = 0; i < scanArray.length; i++) {
181: if (url.endsWith(scanArray[i]))
182: return true;
183: }
184:
185: return false;
186: }
187:
188: private void crawl(String url) throws Exception {
189: url = clear(url);
190:
191: int urlRating = checkForResult(url);
192: if (!quiet)
193: Log.out("URL-Rating: " + url + " -> " + urlRating + " @"
194: + currentDepth);
195:
196: if (urlRating > 0) {
197: //System.out.println("!!!");
198: //Getter.chill(1000);
199: //System.exit(0);
200: } else if (urlRating < 0 && currentDepth > 0) {
201: if (!quiet)
202: Log.out("SKIP " + url);
203: return;
204: }
205:
206: Getter urlGetter = new Getter(localDir);
207: String content = urlGetter.fetch(url);
208:
209: int factor = rate(content);
210: if (!quiet)
211: Log.out("Content-Rating: " + url + " -> " + factor + " @"
212: + currentDepth);
213:
214: if (factor < MIN_FACTOR) {
215: if (!quiet)
216: Log.out("DROP: " + url);
217: return;
218: }
219:
220: if (!ultraquiet)
221: Log.out("Url: " + url + " -> " + urlRating + ":" + factor
222: + "@" + currentDepth);
223:
224: Vector m = sort(content,
225: url.substring(0, url.lastIndexOf("/")), "href=\"");
226: m = addVector(m, sort(content, url.substring(0, url
227: .lastIndexOf("/")), "src=\""));
228: m = addVector(m, sort(content, url.substring(0, url
229: .lastIndexOf("/")), "HREF=\""));
230: m = addVector(m, sort(content, url.substring(0, url
231: .lastIndexOf("/")), "SRC=\""));
232:
233: Enumeration links = m.elements();
234:
235: while (links.hasMoreElements()) {
236:
237: String next = (String) links.nextElement();
238:
239: if (!quiet)
240: Log.out("PROCESS: " + next);
241: boolean skip = false;
242:
243: while (!skip) {
244: for (int i = 0; i < typeArray.length; i++) {
245: if (next.endsWith(typeArray[i])
246: || typeArray[i].trim().equals("*")) {
247: Log.out("HIT: " + url + " -> " + next);
248: //Getter.chill(2000);
249:
250: if (!LOAD || !checkForScanableUrl(url))
251: continue;
252:
253: int x = next.indexOf("/");
254:
255: if ((x > 0)
256: && (next.substring(0, x).indexOf(".") > 0)) {
257: Getter urlGetter2 = new Getter(localDir);
258: urlGetter2.fetch(next, false);
259:
260: continue;
261: }
262: }
263: }
264:
265: skip = true;
266: }
267:
268: if (currentDepth < MAX) {
269:
270: int x = next.indexOf("/");
271:
272: if ((x > 0) && (next.substring(0, x).indexOf(".") > 0)) {
273: currentDepth++;
274: crawl(next);
275: currentDepth--;
276: }
277: }
278: }
279: }
280:
281: private Vector sort(String content, String url, String index) {
282: Vector res = new Vector();
283: int wo = 0;
284:
285: while (true) {
286: wo = content.indexOf(index);
287:
288: if (wo < 0) {
289: return res;
290: }
291:
292: content = content.substring(wo + index.length());
293:
294: String was = content.substring(0, content.indexOf("\""));
295:
296: was = createAbsoluteUrl(was, url);
297: res.add(was);
298: if (!quiet)
299: Log.out("ADD: " + was);
300: }
301: }
302:
303: private String[] check(String auswahl) {
304: StringTokenizer tokenizer = new StringTokenizer(auswahl, "-",
305: false);
306: String[] strArr = new String[tokenizer.countTokens()];
307: int tmp = 0;
308:
309: while (tokenizer.hasMoreElements()) {
310: strArr[tmp] = (String) tokenizer.nextElement();
311: tmp++;
312: }
313:
314: return strArr;
315: }
316:
317: private String createAbsoluteUrl(String newLink, String baseUrl) {
318: newLink = clear(newLink);
319:
320: if (newLink.startsWith(baseUrl)) {
321: return newLink;
322: }
323:
324: if (newLink.startsWith("/") && (baseUrl.indexOf("/") > 0)) {
325: newLink = baseUrl.substring(0, baseUrl.indexOf("/"))
326: + newLink;
327: } else if (newLink.startsWith("/")
328: && (baseUrl.indexOf("/") < 0)) {
329: newLink = baseUrl + newLink;
330: } else if ((newLink.indexOf(".") > 0)) {
331: int idx = newLink.indexOf("/");
332: String tmp = "";
333:
334: if (idx >= 0) {
335: tmp = newLink.substring(0, idx);
336: }
337:
338: if ((tmp.indexOf(".") > 0)) {
339: return clear(newLink);
340: }
341:
342: if (baseUrl.endsWith("/")) {
343: newLink = baseUrl + newLink;
344: } else {
345: newLink = baseUrl + "/" + newLink;
346: }
347: }
348:
349: //Log.out("-> " + newLink);
350:
351: return newLink;
352: }
353:
354: }
355:
356: class Getter {
357: private String localDir = null;
358:
359: public Getter(String localDir) {
360: this .localDir = localDir;
361: }
362:
363: public String fetch(String url) {
364: try {
365: String host = url.substring(0, url.indexOf("/"));
366: String wo = url.substring(url.indexOf("/"));
367: String result = "";
368:
369: //Log.out(">> " + host + wo);
370:
371: Socket deal = new Socket(host, 80);
372: deal.setSoTimeout(5000);
373:
374: BufferedWriter out = new BufferedWriter(
375: new OutputStreamWriter(deal.getOutputStream()));
376: BufferedReader in = new BufferedReader(
377: new InputStreamReader(deal.getInputStream()));
378:
379: out.write("GET http://" + url + " HTTP/1.0\n\n");
380: out.flush();
381:
382: int len = 0;
383:
384: while (!in.ready() && (len < 5000)) {
385: chill(100);
386: len += 100;
387: }
388:
389: while (in.ready()) {
390: result = result + in.readLine();
391: }
392:
393: out.close();
394: in.close();
395:
396: return result;
397: } catch (Exception ex) {
398: if (!FileSearch.quiet)
399: ex.printStackTrace();
400: }
401:
402: return "";
403: }
404:
405: public void fetch(String url, boolean force) {
406: try {
407: String host = url.substring(0, url.indexOf("/"));
408: String wo = url.substring(url.indexOf("/"));
409: String result = "";
410:
411: if (!FileSearch.quiet)
412: Log.debug(">>> " + host + wo);
413:
414: //JFtp.statusP.jftp.ensureLogging();
415: File d = new File(localDir);
416: d.mkdir();
417:
418: File f = new File(localDir
419: + wo.substring(wo.lastIndexOf("/") + 1));
420:
421: if (f.exists() && !force) {
422: if (!FileSearch.quiet)
423: Log.debug(">>> file already exists...");
424:
425: return;
426: } else {
427: f.delete();
428: }
429:
430: Socket deal = new Socket(host, 80);
431: BufferedWriter out = new BufferedWriter(
432: new OutputStreamWriter(deal.getOutputStream()));
433: DataInputStream in = new DataInputStream(
434: new BufferedInputStream(deal.getInputStream()));
435:
436: BufferedOutputStream localOut = new BufferedOutputStream(
437: new FileOutputStream(localDir
438: + wo.substring(wo.lastIndexOf("/") + 1)));
439:
440: byte[] alu = new byte[2048];
441:
442: out.write("GET http://" + url + " HTTP/1.0\n\n");
443: out.flush();
444:
445: boolean line = true;
446: boolean bin = false;
447:
448: while (true) {
449: chill(10);
450:
451: String tmp = "";
452:
453: while (line) {
454: String x = in.readLine();
455:
456: if (x == null) {
457: break;
458: }
459:
460: tmp += (x + "\n");
461:
462: if (x.equals("")) {
463: line = false;
464: }
465: }
466:
467: int x = in.read(alu);
468:
469: if (x == -1) {
470: if (line) {
471: localOut.write(tmp.getBytes(), 0, tmp.length());
472: }
473:
474: out.close();
475: in.close();
476: localOut.flush();
477: localOut.close();
478:
479: return;
480: } else {
481: localOut.write(alu, 0, x);
482: }
483: }
484: } catch (Exception ex) {
485: if (!FileSearch.quiet)
486: ex.printStackTrace();
487: }
488: }
489:
490: public static void chill(int time) {
491: try {
492: Thread.sleep(time);
493: } catch (Exception ex) {
494: }
495: }
496: }
|