MiniCrawler: A skeletal Web crawler : Web Crawler « Network « C# / C Sharp

Home
C# / C Sharp
1.2D Graphics
2.Class Interface
3.Collections Data Structure
4.Components
5.Data Types
6.Database ADO.net
7.Design Patterns
8.Development Class
9.Event
10.File Stream
11.Generics
12.GUI Windows Form
13.Language Basics
14.LINQ
15.Network
16.Office
17.Reflection
18.Regular Expressions
19.Security
20.Services Event
21.Thread
22.Web Services
23.Windows
24.Windows Presentation Foundation
25.XML
26.XML LINQ
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source
C# / C Sharp » Network » Web Crawler | Screenshots
MiniCrawler: A skeletal Web crawler

/*
C#: The Complete Reference 
by Herbert Schildt 

Publisher: Osborne/McGraw-Hill (March 8, 2002)
ISBN: 0072134852
*/


// MiniCrawler: A skeletal Web crawler. 
 
using System; 
using System.Net; 
using System.IO; 
 
/// <summary>
/// A skeletal, interactive Web crawler. Starting from the URI given on the
/// command line, it downloads the page, shows each absolute http/https link
/// it finds, and lets the user follow a link, skip to the next one, or quit.
/// </summary>
public class MiniCrawler {

  // Text that introduces an absolute http/https link inside an attribute.
  const string LinkMarker = "href=\"http";

  /// <summary>
  /// Finds the next absolute http/https link in a content string.
  /// </summary>
  /// <param name="htmlstr">The page content to search.</param>
  /// <param name="startloc">
  /// Index at which to begin searching. When a link is returned, this is
  /// updated to the index of the link's closing quote so that the next call
  /// continues after it. It is left unchanged when no link is returned.
  /// </param>
  /// <returns>
  /// The link text between the quotes, or null if no further complete link
  /// exists at or after <paramref name="startloc"/>.
  /// </returns>
  static string FindLink(string htmlstr,
                         ref int startloc) {
    // Nothing to search, or a start position outside the string.
    if (htmlstr == null || startloc < 0 || startloc > htmlstr.Length) {
      return null;
    }

    // A case-insensitive ordinal search avoids lower-casing the whole page
    // on every call and is not affected by the current culture.
    int i = htmlstr.IndexOf(LinkMarker, startloc,
                            StringComparison.OrdinalIgnoreCase);
    if (i == -1) {
      return null;
    }

    int start = htmlstr.IndexOf('"', i) + 1; // first char after opening quote
    int end = htmlstr.IndexOf('"', start);   // closing quote
    if (end == -1) {
      // Unterminated attribute value: there is no complete link to return.
      return null;
    }

    startloc = end;
    return htmlstr.Substring(start, end - start);
  }

  /// <summary>
  /// Downloads the page at the given URI and returns its full text.
  /// </summary>
  /// <param name="uri">The http/https URI to fetch.</param>
  /// <returns>The entire response body as a string.</returns>
  /// <exception cref="NotSupportedException">
  /// The URI's scheme is unknown, or is not handled by HttpWebRequest.
  /// </exception>
  static string DownloadPage(string uri) {
    /* Create a WebRequest to the specified URI. */
    HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
    if (req == null) {
      // A registered non-HTTP scheme (e.g. file:) yields another request
      // type; report it through the same path as an unknown scheme.
      throw new NotSupportedException(
          "Only http and https URIs are supported: " + uri);
    }

    // Send the request, wrap the response stream in a reader and read the
    // entire page. The using statements release the response, stream and
    // reader even if reading fails.
    using (HttpWebResponse resp = (HttpWebResponse) req.GetResponse())
    using (Stream istrm = resp.GetResponseStream())
    using (StreamReader rdr = new StreamReader(istrm)) {
      return rdr.ReadToEnd();
    }
  }

  /// <summary>
  /// Walks through the links in a page, asking the user what to do with each.
  /// </summary>
  /// <param name="page">The page content to scan for links.</param>
  /// <returns>
  /// The link the user chose to follow, or null if the user quit, input
  /// ended, or the page has no more links.
  /// </returns>
  static string ChooseNextLink(string page) {
    int curloc = 0; // holds current location in response

    while (true) {
      // Find the next URI to link to.
      string link = FindLink(page, ref curloc);
      if (link == null) {
        Console.WriteLine("No link found.");
        return null;
      }

      Console.WriteLine("Link found: " + link);

      Console.Write("Link, More, Quit?");
      string answer = Console.ReadLine();

      // ReadLine returns null at end of input; treat that as Quit.
      if (answer == null ||
          string.Equals(answer, "Q", StringComparison.OrdinalIgnoreCase)) {
        return null;
      }
      if (string.Equals(answer, "L", StringComparison.OrdinalIgnoreCase)) {
        return link;
      }
      if (string.Equals(answer, "M", StringComparison.OrdinalIgnoreCase)) {
        Console.WriteLine("Searching for another link.");
      }
      // Any other answer simply moves on to the next link.
    }
  }

  /// <summary>
  /// Program entry point. Expects exactly one argument: the starting URI.
  /// </summary>
  /// <param name="args">Command-line arguments.</param>
  public static void Main(string[] args) {
    if (args.Length != 1) {
      Console.WriteLine("Usage: MiniCrawler <uri>");
      return;
    }

    string uristr = args[0]; // holds current URI

    try {
      while (uristr != null) {
        Console.WriteLine("Linking to " + uristr);

        string page = DownloadPage(uristr);

        // Null means stop; otherwise this is the next URI to visit.
        uristr = ChooseNextLink(page);
      }
    } catch (WebException exc) {
      Console.WriteLine("Network Error: " + exc.Message +
                        "\nStatus code: " + exc.Status);
    } catch (ProtocolViolationException exc) {
      Console.WriteLine("Protocol Error: " + exc.Message);
    } catch (UriFormatException exc) {
      Console.WriteLine("URI Format Error: " + exc.Message);
    } catch (NotSupportedException exc) {
      Console.WriteLine("Unknown Protocol: " + exc.Message);
    } catch (IOException exc) {
      Console.WriteLine("I/O Error: " + exc.Message);
    }

    Console.WriteLine("Terminating MiniCrawler.");
  }
}


           
       
Related examples in the same category
1.Set the BaseAddress for WebClient
2.Build the DownloadString
3.Download a web page in a thread
4.Output webpage content
5.Create GetResponse from WebRequest
6.Check the ContentType
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.