package bdd.search.spider;

import java.net.URL;
import java.net.MalformedURLException;
import java.io.File;
import java.io.FileReader;
import java.io.BufferedReader;
import java.util.Hashtable;
import bdd.search.EnginePrefs;
import bdd.search.Monitor;
import bdd.util.FIFOQueue;

/** Written by Tim Macinta 1997 <br>
 * Distributed under the GNU Public License
 * (a copy of which is enclosed with the source). <br>
 * <br>
 * Calling the Crawler's start() method will cause the Crawler to
 * index all of the sites in its queue and then replace the main
 * index with the updated index when it completes. The Crawler's
 * queue should be filled with the starting URLs before calling
 * start().
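 * <br><br>
 * A minimal usage sketch (the seed URL below is purely illustrative, and the
 * MalformedURLException thrown by the URL constructor must be handled by the
 * caller):
 * <pre>
 *   EnginePrefs prefs = new EnginePrefs();
 *   Crawler crawler = new Crawler(prefs.getWorkingDir(), prefs);
 *   crawler.addURL(new URL("http://www.example.com/")); // example seed URL
 *   crawler.start(); // indexes the queue, then swaps in the updated index
 * </pre>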
 */
public class Crawler extends Thread {

    File working_dir;                        // directory for temp files
    Indexer indexer;                         // handles post-crawl indexing
    FIFOQueue q = new FIFOQueue();           // url queue
    Hashtable urls_done = new Hashtable(40); // keeps track of what
                                             // urls are already processed
    EnginePrefs eng_prefs;                   // preferences
    boolean exit_when_done = false;          // exit when done indexing

    /** "working_dir" should be a directory that only this
     * Crawler and the Indexer it creates will access.
     * This means that if several Crawlers are running
     * simultaneously, they should each be given a different "working_dir"
     * directory. Also, no other threads should write to this
     * directory (except for the selected Indexer).
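     * <br><br>
     * For example, two Crawlers running side by side each get their own
     * directory (the paths and the "prefs" variable are illustrative only):
     * <pre>
     *   Crawler a = new Crawler(new File("/tmp/crawl_a"), prefs);
     *   Crawler b = new Crawler(new File("/tmp/crawl_b"), prefs); // separate dir
     * </pre>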
     */
    public Crawler(File working_dir, EnginePrefs eng_prefs) {
        this.eng_prefs = eng_prefs;
        this.working_dir = working_dir;
        indexer = new Indexer(working_dir, this, eng_prefs);
        indexer.start();
    }

    /** Takes "url_to_queue" and adds it to this Crawler's queue of URLs.
     * This method should be used to add all of the desired starting URLs to
     * the queue before the Crawler is started. If the URL has already
     * been processed, or if it is a disallowed URL, it is not added.
     */
    public void addURL(URL url_to_queue) {
        if (!eng_prefs.URLAllowed(url_to_queue))
            return;                            // check if URL is allowed
        if (eng_prefs.URLNotIndexable(url_to_queue))
            return;                            // don't index non-text
        url_to_queue = simplify(url_to_queue); // remove loops/anchors
        if (urls_done.put(url_to_queue, url_to_queue) == null) {
            q.addElement(url_to_queue);        // add if not done already
            Monitor m = eng_prefs.getMonitor();
            if (m != null)
                m.indexing(url_to_queue);
        }
    }

    /** Takes "url" and removes all references to "/./" and "/../". This
     * can be used to help eliminate looping. Also removes all anchors
     * (i.e., everything after and including a '#') and fills in the default
     * http port. For example, "http://host/a/./b/../c.html#top" becomes
     * "http://host:80/a/c.html". */
    URL simplify(URL url) {
        String file = url.getFile();
        boolean changed = false; // keep track of whether we change anything

        // collapse all occurrences of "/./"

        int i = file.indexOf("/./");
        while (i >= 0) {
            changed = true;
            file = file.substring(0, i) + file.substring(i + 2);
            i = file.indexOf("/./");
        }

        // collapse all occurrences of "/../" (by removing the preceding directory)

        i = file.indexOf("/../");
        while (i >= 0) {
            changed = true;
            int i2 = file.lastIndexOf('/', i - 1);
            if (i2 < 0)
                i2 = i;
            file = file.substring(0, i2) + file.substring(i + 3);
            i = file.indexOf("/../");
        }

        // remove anchor if necessary

        if (url.getRef() != null)
            changed = true;
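        // (no explicit stripping is needed here: the URL rebuilt below is
        // constructed from "file", and url.getFile() never includes the '#'
        // reference, so marking the URL as changed is enough to drop the anchor)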

        // set port if it's not set already

        int port = url.getPort();
        String proto = url.getProtocol().toLowerCase();
        if (port < 0 && proto.equals("http")) {
            changed = true;
            port = 80;
        }

        // create a new URL if anything changed

        if (changed) {
            try {
                url = new URL(proto, url.getHost(), port, file);
            } catch (MalformedURLException e) {
                e.printStackTrace();
            }
        }
        return url;
    }

    /** This is where the actual crawling occurs. URLs are pulled off the
     * queue one at a time and loaded; successfully loaded pages are handed
     * to the Indexer, redirected URLs are re-queued, and failed URLs are
     * reported to the Monitor (if one is registered). */
    public void run() {
        if (!q.hasMoreElements())
            return;       // return if there's nothing to do
        int tmp_file = 0; // used to generate unique temporary filenames

        URLStatus url_status;
        while (true) {
            url_status = new URLStatus((URL) q.nextElement(),
                    new File(working_dir, tmp_file + ".tmp"), eng_prefs);
            tmp_file++;
            url_status.readContent();
            if (url_status.loaded()) {
                indexer.queueURL(url_status);
            } else if (url_status.moved()) {
                addURL(url_status.actual_url);
            } else {
                Monitor m = eng_prefs.getMonitor();
                if (m != null)
                    m.reportError(url_status.actual_url);
            }
            if (q.hasMoreElements()) {
                eng_prefs.pauseBetweenURLs();
            } else {
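                // the crawl queue is empty: wait while the indexer still has
                // pages to process, since parsing those pages may add new
                // links back onto this crawler's queue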
                while (!q.hasMoreElements()
                        && indexer.q.hasMoreElements()) {
                    eng_prefs.pauseBetweenURLs();
                }
                if (!q.hasMoreElements()) {
                    break;
                }
            }
        }
        Monitor m = eng_prefs.getMonitor();
        if (m != null)
            m.crawlerDone(this);
        indexer.stopWhenDone(exit_when_done);
    }

    /** This is the method that is called when this class is invoked from
     * the command line. Calling this method causes a Crawler to be created
     * and started, with the starting URLs read from the file specified by
     * the first argument (arg[0]). The file should contain only the URLs,
     * each on a line by itself. Blank lines are allowed, and lines beginning
     * with "#" are treated as comments and ignored.
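     * <br><br>
     * A sample URL file (the file name and URLs are illustrative only):
     * <pre>
     *   # seed URLs for this crawl
     *   http://www.example.com/
     *   http://www.example.com/docs/
     * </pre>
     * which would be used as: <br>
     * <code>java bdd.search.spider.Crawler seeds.txt</code>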
     */
    public static void main(String arg[]) {
        if (arg.length < 1)
            return;
        main(new File(arg[0]), new EnginePrefs(), true);
    }

    public static void main(File file, EnginePrefs prefs) {
        main(file, prefs, false);
    }

    public static void main(File file, EnginePrefs prefs, boolean exit) {
        Crawler cr = new Crawler(prefs.getWorkingDir(), prefs);
        try {
            BufferedReader in = new BufferedReader(new FileReader(file));
            String line = in.readLine();
            while (line != null) {
                line = line.trim();
                if (!line.equals("") && !line.startsWith("#")) {
                    try {
                        cr.addURL(new URL(line));
                    } catch (MalformedURLException e2) {
                        e2.printStackTrace();
                    }
                }
                line = in.readLine();
            }
            in.close();
            cr.exit_when_done = exit;
            cr.start();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}