HtmlWeb.cs :  » Network-Clients » iReaper » HtmlAgilityPack » C# / CSharp Open Source

Home
C# / CSharp Open Source
1.2.6.4 mono .net core
2.2.6.4 mono core
3.Aspect Oriented Frameworks
4.Bloggers
5.Build Systems
6.Business Application
7.Charting Reporting Tools
8.Chat Servers
9.Code Coverage Tools
10.Content Management Systems CMS
11.CRM ERP
12.Database
13.Development
14.Email
15.Forum
16.Game
17.GIS
18.GUI
19.IDEs
20.Installers Generators
21.Inversion of Control Dependency Injection
22.Issue Tracking
23.Logging Tools
24.Message
25.Mobile
26.Network Clients
27.Network Servers
28.Office
29.PDF
30.Persistence Frameworks
31.Portals
32.Profilers
33.Project Management
34.RSS RDF
35.Rule Engines
36.Script
37.Search Engines
38.Sound Audio
39.Source Control
40.SQL Clients
41.Template Engines
42.Testing
43.UML
44.Web Frameworks
45.Web Service
46.Web Testing
47.Wiki Engines
48.Windows Presentation Foundation
49.Workflows
50.XML Parsers
C# / C Sharp
C# / C Sharp by API
C# / CSharp Tutorial
C# / CSharp Open Source » Network Clients » iReaper 
iReaper » HtmlAgilityPack » HtmlWeb.cs
// HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>
using System;
using System.IO;
using System.Net;
using System.Xml;
using System.Xml.Serialization;
using System.Xml.Xsl;
using Microsoft.Win32;

namespace HtmlAgilityPack{
  /// <summary>
  /// A utility class to get HTML document from HTTP.
  /// </summary>
  public class HtmlWeb
  {
    /// <summary>
    /// Represents the method that will handle the PreRequest event.
    /// </summary>
    /// <param name="request">The request that is about to be sent; the handler may modify it.</param>
    /// <returns>
    /// true to let the request proceed; false to cancel it, in which case the fetch
    /// returns <see cref="HttpStatusCode.ResetContent"/> without contacting the server.
    /// </returns>
    public delegate bool PreRequestHandler(HttpWebRequest request);

    /// <summary>
    /// Represents the method that will handle the PostResponse event.
    /// </summary>
    /// <param name="request">The request that was sent.</param>
    /// <param name="response">
    /// The response that was received. This may be an error response recovered from a
    /// <see cref="WebException"/>.
    /// </param>
    public delegate void PostResponseHandler(HttpWebRequest request, HttpWebResponse response);

    /// <summary>
    /// Represents the method that will handle the PreHandleDocument event.
    /// </summary>
    /// <param name="document">The document that Load is about to return to its caller.</param>
    public delegate void PreHandleDocumentHandler(HtmlDocument document);

    // Chunk size, in bytes, used when copying a response stream to a file.
    private int _streamBufferSize = 1024;
    // Root directory of the on-disk cache; while null, UsingCache always reports false.
    private string _cachePath;
    // Caller's request to use the cache (only effective when _cachePath is set).
    private bool _usingCache;
    // True when the last fetch was satisfied from the cache instead of a fresh download.
    private bool _fromCache;
    // When true, fetches never touch the network and fail if the cache has no entry.
    private bool _cacheOnly;
    // When true, each request gets a fresh CookieContainer.
    private bool _useCookies;
    // Duration of the last request in milliseconds, measured with Environment.TickCount.
    private int _requestDuration;
    // Passed to the document loader when loading from a file URI.
    private bool _autoDetectEncoding = true;
    // Status of the last request made through LoadUrl.
    private HttpStatusCode _statusCode = HttpStatusCode.OK;
    // ResponseUri of the last response received.
    private Uri _responseUri;

    /// <summary>
    /// Occurs before an HTTP request is executed. The handler may modify the request;
    /// returning false cancels it.
    /// </summary>
    public PreRequestHandler PreRequest;

    /// <summary>
    /// Occurs after an HTTP request has been executed and a response is available,
    /// before the response body is consumed.
    /// </summary>
    public PostResponseHandler PostResponse;

    /// <summary>
    /// Occurs before an HTML document is handled, i.e. just before Load returns it.
    /// </summary>
    public PreHandleDocumentHandler PreHandleDocument;

    /// <summary>
    /// Creates an instance of an HtmlWeb class with default settings:
    /// no cache path, cookies disabled and encoding auto-detection enabled.
    /// </summary>
    public HtmlWeb()
    {
    }

    /// <summary>
    /// Fetches an Internet resource with an HTTP GET and saves it to the specified file.
    /// </summary>
    /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <param name="path">The location of the file where you want to save the document.</param>
    public void Get(string url, string path)
    {
      const string defaultMethod = "GET";
      Get(url, path, defaultMethod);
    }
      
    /// <summary>
    /// Fetches an Internet resource with the given HTTP method and saves it to the specified file.
    /// </summary>
    /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". Only http and https are accepted.</param>
    /// <param name="path">The location of the file where you want to save the document.</param>
    /// <param name="method">The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.</param>
    /// <exception cref="HtmlWebException">The URL scheme is neither http nor https.</exception>
    public void Get(string url, string path, string method)
    {
      Uri target = new Uri(url);
      string scheme = target.Scheme;

      // Reject anything that is not plain HTTP(S) before touching the network.
      bool isHttp = (scheme == Uri.UriSchemeHttps) || (scheme == Uri.UriSchemeHttp);
      if (!isHttp)
      {
        throw new HtmlWebException("Unsupported uri scheme: '" + target.Scheme + "'.");
      }

      Get(target, method, path, null);
    }

    /// <summary>
    /// Loads an HTML document from an Internet resource using an HTTP GET.
    /// </summary>
    /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <returns>A new HTML document.</returns>
    public HtmlDocument Load(string url)
    {
      const string defaultMethod = "GET";
      HtmlDocument document = Load(url, defaultMethod);
      return document;
    }

    /// <summary>
    /// Loads an HTML document from an Internet resource or from a local file.
    /// </summary>
    /// <param name="url">
    /// The requested URL, such as "http://Myserver/Mypath/Myfile.asp".
    /// http, https and file URIs are accepted.
    /// </param>
    /// <param name="method">
    /// The HTTP method used to open the connection, such as GET, POST, PUT, or PROPFIND.
    /// Not used for file URIs.
    /// </param>
    /// <returns>A new HTML document.</returns>
    /// <exception cref="HtmlWebException">The URI scheme is not http, https or file.</exception>
    public HtmlDocument Load(string url, string method)
    {
      Uri uri = new Uri(url);
      HtmlDocument doc;
      if ((uri.Scheme == Uri.UriSchemeHttps) ||
        (uri.Scheme == Uri.UriSchemeHttp))
      {
        doc = LoadUrl(uri, method);
      }
      else if (uri.Scheme == Uri.UriSchemeFile)
      {
        doc = new HtmlDocument();
        // NOTE(review): the original assigned OptionAutoCloseOnEnd twice (false, then true).
        // The effective value is kept here. The HTTP path (LoadUrl) uses
        // OptionAutoCloseOnEnd = false and OptionFixNestedTags = true - confirm which
        // combination is intended for local files.
        doc.OptionAutoCloseOnEnd = true;

        // The loader opens a file-system path, so hand it the local path rather than the
        // raw URL text: "file:///C:/page.htm" is not a valid path, while LocalPath
        // ("C:\page.htm") is. A plain absolute path passed as url round-trips unchanged.
        doc.DetectEncodingAndLoad(uri.LocalPath, _autoDetectEncoding);
      }
      else
      {
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
      }

      // Give the subscriber a chance to inspect or adjust the document before it is returned.
      if (PreHandleDocument != null)
      {
        PreHandleDocument(doc);
      }
      return doc;
    }

    // Tells whether the file at 'path' is HTML, judging by the content type registered
    // for its extension. Returns false when no content type can be determined; the lookup
    // is given a null default, which previously caused a NullReferenceException.
    private bool IsCacheHtmlContent(string path)
    {
      string ct = GetContentTypeForExtension(Path.GetExtension(path), null);
      if (ct == null)
      {
        return false;
      }
      return IsHtmlContent(ct);
    }

    // Tells whether a Content-Type value denotes HTML, i.e. starts with "text/html"
    // (parameters such as "; charset=utf-8" are allowed after it).
    // A null or empty value is treated as "not HTML".
    private bool IsHtmlContent(string contentType)
    {
      if (contentType == null)
      {
        return false;
      }
      // Ordinal, case-insensitive comparison: MIME types are ASCII tokens, and a
      // culture-sensitive ToLower() mis-handles 'I' under some cultures (e.g. Turkish).
      return contentType.StartsWith("text/html", StringComparison.OrdinalIgnoreCase);
    }

    // Returns the path of the XML file holding the cached response headers for 'uri':
    // the cached document's own path with ".h.xml" appended.
    private string GetCacheHeadersPath(Uri uri)
    {
      string documentPath = GetCachePath(uri);
      return documentPath + ".h.xml";
    }

    /// <summary>
    /// Gets the cache file path for a specified url.
    /// </summary>
    /// <remarks>
    /// The path is built from the host and the absolute path of the URI only;
    /// the query string does not take part in it.
    /// </remarks>
    /// <param name="uri">The url for which to retrieve the cache path. May not be null.</param>
    /// <returns>The cache file path.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="uri"/> is null.</exception>
    /// <exception cref="HtmlWebException">The cache is not enabled.</exception>
    public string GetCachePath(Uri uri)
    {
      if (uri == null)
      {
        throw new ArgumentNullException("uri");
      }
      if (!UsingCache)
      {
        throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
      }

      // A request for the root ("/") has no file name, so it is stored as ".htm"
      // directly under the cache root; anything else mirrors host + path as directories.
      string relative = (uri.AbsolutePath == "/")
        ? ".htm"
        : (uri.Host + uri.AbsolutePath).Replace('/', '\\');
      return Path.Combine(_cachePath, relative);
    }

    /// <summary>
    /// Gets a value indicating whether the most recent document came from the cache.
    /// </summary>
    public bool FromCache
    {
      get { return _fromCache; }
    }

    /// <summary>
    /// Gets the URI of the Internet resource that actually answered the last request.
    /// </summary>
    public Uri ResponseUri
    {
      get { return _responseUri; }
    }

    /// <summary>
    /// Gets or Sets a value indicating whether documents are taken from the cache only.
    /// When true and a document is not in the cache, nothing is loaded.
    /// </summary>
    /// <exception cref="HtmlWebException">Set to true while the cache is not enabled.</exception>
    public bool CacheOnly
    {
      get { return _cacheOnly; }
      set
      {
        // Cache-only mode makes no sense without a usable cache; turning it off is always allowed.
        if (value)
        {
          if (!UsingCache)
          {
            throw new HtmlWebException("Cache is not enabled. Set UsingCache to true first.");
          }
        }
        _cacheOnly = value;
      }
    }

    /// <summary>
    /// Gets or Sets a value indicating whether requests are given a cookie container.
    /// </summary>
    public bool UseCookies
    {
      get { return _useCookies; }
      set { _useCookies = value; }
    }

    /// <summary>
    /// Gets how long the last request took, in milliseconds.
    /// </summary>
    public int RequestDuration
    {
      get { return _requestDuration; }
    }

    /// <summary>
    /// Gets or Sets a value indicating whether the document encoding is detected automatically.
    /// </summary>
    public bool AutoDetectEncoding
    {
      get { return _autoDetectEncoding; }
      set { _autoDetectEncoding = value; }
    }

    /// <summary>
    /// Gets the status of the last request.
    /// </summary>
    public HttpStatusCode StatusCode
    {
      get { return _statusCode; }
    }

    /// <summary>
    /// Gets or Sets the size, in bytes, of the buffer used when copying response streams.
    /// </summary>
    /// <exception cref="ArgumentException">The assigned value is zero or negative.</exception>
    public int StreamBufferSize
    {
      get
      {
        return _streamBufferSize;
      }
      set
      {
        // Validate the incoming value. The previous code tested the current field, which
        // let the first invalid value through and then rejected every later assignment.
        if (value <= 0)
        {
          throw new ArgumentException("Size must be greater than zero.");
        }
        _streamBufferSize = value;
      }
    }

    // Fetches 'uri' over HTTP(S) into a new document and records the resulting status in
    // _statusCode. When the fetch reports NotModified, the document is read from the
    // cached file instead.
    private HtmlDocument LoadUrl(Uri uri, string method)
    {
      HtmlDocument document = new HtmlDocument();
      document.OptionAutoCloseOnEnd = false;
      document.OptionFixNestedTags = true;

      _statusCode = Get(uri, method, null, document);

      bool servedFromCache = (_statusCode == HttpStatusCode.NotModified);
      if (servedFromCache)
      {
        // The request did not deliver a body; load the cached copy (and detect its encoding).
        string cachedFile = GetCachePath(uri);
        document.DetectEncodingAndLoad(cachedFile);
      }
      return document;
    }

    // Executes an HTTP request for 'uri' and routes the result to the cache, to a file
    // and/or to an in-memory document.
    //
    //   uri    - the resource to request (must produce an HttpWebRequest).
    //   method - the HTTP method.
    //   path   - optional file that receives the downloaded content; may be null.
    //   doc    - optional document populated when the response is HTML; may be null.
    //
    // Returns HttpStatusCode.NotModified when the content is served from the cache
    // (cache-only mode, a 304 response, or a network failure with a cached copy present),
    // HttpStatusCode.ResetContent when the PreRequest handler cancels the request,
    // and the response status otherwise.
    //
    // Throws HtmlWebException when the URI does not yield an HTTP request, when cache-only
    // mode finds no cached file, or when the server answers 304 while the cache is off.
    // Network errors without a response and without a cached copy are rethrown.
    private HttpStatusCode Get(Uri uri, string method, string path, HtmlDocument doc)
    {
      string cachePath = null;
      bool oldFile = false;

      HttpWebRequest req = WebRequest.Create(uri) as HttpWebRequest;
      if (req == null)
      {
        // WebRequest.Create returns other request types for non-HTTP schemes.
        throw new HtmlWebException("Unsupported uri scheme: '" + uri.Scheme + "'.");
      }
      req.Method = method;

      _fromCache = false;
      _requestDuration = 0;
      int tc = Environment.TickCount;
      if (UsingCache)
      {
        cachePath = GetCachePath(req.RequestUri);
        if (File.Exists(cachePath))
        {
          // The cache file's last-write time is stamped with the server's Last-Modified
          // value when it is saved, so that is the timestamp to send back. The last-access
          // time moves whenever the file is read and could hide later server changes.
          req.IfModifiedSince = File.GetLastWriteTime(cachePath);
          oldFile = true;
        }
      }

      if (_cacheOnly)
      {
        if (!File.Exists(cachePath))
        {
          throw new HtmlWebException("File was not found at cache path: '" + cachePath + "'");
        }

        CopyFromCache(cachePath, path);
        _fromCache = true;
        return HttpStatusCode.NotModified;
      }

      if (_useCookies)
      {
        req.CookieContainer = new CookieContainer();
      }

      if (PreRequest != null)
      {
        // allow our user to change the request at will
        if (!PreRequest(req))
        {
          return HttpStatusCode.ResetContent;
        }
      }

      HttpWebResponse resp;

      try
      {
        resp = req.GetResponse() as HttpWebResponse;
      }
      catch (WebException we)
      {
        _requestDuration = Environment.TickCount - tc;
        // Protocol errors (404, 500, ...) still carry a response; keep processing it.
        resp = (HttpWebResponse)we.Response;
        if (resp == null)
        {
          if (oldFile)
          {
            // No answer from the network, but a cached copy exists: fall back to it.
            CopyFromCache(cachePath, path);
            _fromCache = true;
            return HttpStatusCode.NotModified;
          }
          throw;
        }
      }
      catch(Exception)
      {
        _requestDuration = Environment.TickCount - tc;
        throw;
      }

      try
      {
        // allow our user to get some info from the response
        if (PostResponse != null)
        {
          PostResponse(req, resp);
        }

        _requestDuration = Environment.TickCount - tc;
        _responseUri = resp.ResponseUri;

        // Capture everything needed from the response before it is closed.
        HttpStatusCode status = resp.StatusCode;
        bool html = IsHtmlContent(resp.ContentType);
        System.Text.Encoding respenc = TryGetEncoding(resp.ContentEncoding);

        if (status == HttpStatusCode.NotModified)
        {
          if (!UsingCache)
          {
            // this should *never* happen...
            throw new HtmlWebException("Server has send a NotModifed code, without cache enabled.");
          }
          _fromCache = true;
          CopyFromCache(cachePath, path);
          return status;
        }

        Stream s = resp.GetResponseStream();
        if (s != null)
        {
          if (UsingCache)
          {
            // NOTE: LastModified does not contain milliseconds, so we remove them to the file
            SaveStream(s, cachePath, RemoveMilliseconds(resp.LastModified), _streamBufferSize);

            // save headers
            SaveCacheHeaders(req.RequestUri, resp);

            CopyFromCache(cachePath, path);

            // The stream has been consumed by the cache file; populate the document
            // from that file so a first (uncached) load does not return an empty document.
            if ((doc != null) && (html))
            {
              doc.DetectEncodingAndLoad(cachePath);
            }
          }
          else if (path != null)
          {
            // No cache: write the body straight to the requested file.
            SaveStream(s, path, RemoveMilliseconds(resp.LastModified), _streamBufferSize);
            if ((doc != null) && (html))
            {
              doc.DetectEncodingAndLoad(path);
            }
          }
          else if ((doc != null) && (html))
          {
            // try to work in-memory
            if (respenc != null)
            {
              doc.Load(s, respenc);
            }
            else
            {
              doc.Load(s, true);
            }
          }
        }
        return status;
      }
      finally
      {
        // Always release the connection, including on the 304 and exception paths.
        resp.Close();
      }
    }

    // Copies the cached file to 'path' (no-op when 'path' is null) and gives the copy
    // the cached file's last-write time.
    private static void CopyFromCache(string cachePath, string path)
    {
      if (path == null)
      {
        return;
      }
      IOLibrary.CopyAlways(cachePath, path);
      // touch the file
      File.SetLastWriteTime(path, File.GetLastWriteTime(cachePath));
    }

    // Resolves an encoding by name, returning null when the name is null, empty or not a
    // known text encoding. The value comes from the Content-Encoding header, which may
    // hold tokens such as "gzip" that Encoding.GetEncoding rejects.
    private static System.Text.Encoding TryGetEncoding(string name)
    {
      if ((name == null) || (name.Length == 0))
      {
        return null;
      }
      try
      {
        return System.Text.Encoding.GetEncoding(name);
      }
      catch (ArgumentException)
      {
        return null;
      }
      catch (NotSupportedException)
      {
        return null;
      }
    }

    // Reads one response header from the cached headers file of 'requestUri'.
    //
    //   name - header name, matched case-insensitively against the "n" attribute.
    //   def  - value returned when the header (or its value) is not present.
    //
    // note: some headers are collection (ex: www-authenticate)
    // we don't handle that here; the first matching entry wins.
    private string GetCacheHeader(Uri requestUri, string name, string def)
    {
      XmlDocument doc = new XmlDocument();
      doc.Load(GetCacheHeadersPath(requestUri));

      // Entries are stored as <h n="header-name" v="header-value"/>. Compare in code
      // rather than splicing 'name' into an XPath string, so quotes in the name cannot
      // break the query.
      foreach (XmlNode node in doc.SelectNodes("//h"))
      {
        XmlAttribute nameAtt = node.Attributes["n"];
        if ((nameAtt == null) || !string.Equals(nameAtt.Value, name, StringComparison.OrdinalIgnoreCase))
        {
          continue;
        }
        // The value lives in the "v" attribute (the old code indexed by the header name,
        // which never exists as an attribute).
        XmlAttribute valueAtt = node.Attributes["v"];
        return (valueAtt != null) ? valueAtt.Value : def;
      }
      return def;
    }

    // Writes the response headers next to the cached document as
    // <c><h n="name" v="value"/>...</c>, one <h> element per header.
    private void SaveCacheHeaders(Uri requestUri, HttpWebResponse resp)
    {
      string headersFile = GetCacheHeadersPath(requestUri);

      XmlDocument headersXml = new XmlDocument();
      headersXml.LoadXml("<c></c>");
      XmlNode root = headersXml.FirstChild;

      foreach (string headerName in resp.Headers)
      {
        XmlElement entry = headersXml.CreateElement("h");
        entry.SetAttribute("n", headerName);
        entry.SetAttribute("v", resp.Headers[headerName]);
        root.AppendChild(entry);
      }

      headersXml.Save(headersFile);
    }

    // Copies 'stream' to the file at 'path' (created or overwritten), closes the source
    // stream, and sets the file's last-write time to 'touchDate'.
    //
    //   stream           - source to read until its end; closed before returning.
    //   path             - destination file; missing directories are created.
    //   touchDate        - last-write time to stamp on the file.
    //   streamBufferSize - copy buffer size in bytes; must be greater than zero.
    //
    // Returns the number of bytes written.
    private static long SaveStream(Stream stream, string path, DateTime touchDate, int streamBufferSize)
    {
      if (stream == null)
      {
        throw new ArgumentNullException("stream");
      }
      if (streamBufferSize <= 0)
      {
        // A zero-sized buffer used to end the copy loop at once, silently leaving an empty file.
        throw new ArgumentOutOfRangeException("streamBufferSize", "Size must be greater than zero.");
      }

      FilePreparePath(path);
      long len = 0;
      try
      {
        using (FileStream fs = new FileStream(path, FileMode.Create, FileAccess.Write))
        {
          // One reusable buffer instead of a fresh array per chunk.
          byte[] buffer = new byte[streamBufferSize];
          int read;
          while ((read = stream.Read(buffer, 0, buffer.Length)) > 0)
          {
            fs.Write(buffer, 0, read);
            len += read;
          }
        }
      }
      finally
      {
        // The source stream has always been closed by this method; keep that contract.
        stream.Close();
      }
      File.SetLastWriteTime(path, touchDate);
      return len;
    }

    // Makes sure the file at 'target' can be written: clears the read-only flag when
    // the file exists, otherwise creates its parent directory when it is missing.
    private static void FilePreparePath(string target)
    {
      if (File.Exists(target))
      {
        FileAttributes atts = File.GetAttributes(target);
        File.SetAttributes(target, atts & ~FileAttributes.ReadOnly);
      }
      else
      {
        // GetDirectoryName yields "" for a bare file name and null for a root;
        // there is nothing to create in either case (CreateDirectory("") would throw).
        string dir = Path.GetDirectoryName(target);
        if (!string.IsNullOrEmpty(dir))
        {
          // CreateDirectory is a no-op when the directory already exists.
          Directory.CreateDirectory(dir);
        }
      }
    }

    // Returns 't' truncated to whole seconds. The result is built from raw ticks,
    // so its Kind is Unspecified regardless of the Kind of 't'.
    private static DateTime RemoveMilliseconds(DateTime t)
    {
      long wholeSecondTicks = t.Ticks - (t.Ticks % TimeSpan.TicksPerSecond);
      return new DateTime(wholeSecondTicks);
    }

    /// <summary>
    /// Gets the path extension for a given MIME content type, using the MIME database
    /// in the Windows registry (HKEY_CLASSES_ROOT\MIME\Database\Content Type).
    /// </summary>
    /// <param name="contentType">
    /// The input MIME content type. Parameters after a ';' (such as "; charset=utf-8") are ignored.
    /// </param>
    /// <param name="def">The default path extension to return if the type is unknown or any error occurs.</param>
    /// <returns>The MIME content type's path extension, or <paramref name="def"/>.</returns>
    public static string GetExtensionForContentType(string contentType, string def)
    {
      if ((contentType == null) || (contentType.Length == 0))
      {
        return def;
      }

      // Registry keys are named after the bare media type, so drop any parameters.
      int semicolon = contentType.IndexOf(';');
      string mediaType = (semicolon >= 0) ? contentType.Substring(0, semicolon) : contentType;
      mediaType = mediaType.Trim();
      if (mediaType.Length == 0)
      {
        return def;
      }

      try
      {
        using (RegistryKey key = Registry.ClassesRoot.OpenSubKey(@"MIME\Database\Content Type\" + mediaType, false))
        {
          // OpenSubKey returns null for an unknown type; that is the common case, not an error.
          if (key == null)
          {
            return def;
          }
          string ext = key.GetValue("Extension", def) as string;
          return (ext != null) ? ext : def;
        }
      }
      catch(Exception)
      {
        // Best effort by contract: registry access problems fall back to the default.
        return def;
      }
    }

    /// <summary>
    /// Gets the MIME content type for a given path extension, using the "Content Type"
    /// value registered for the extension under HKEY_CLASSES_ROOT.
    /// </summary>
    /// <param name="extension">The input path extension, including the leading dot (as returned by Path.GetExtension).</param>
    /// <param name="def">The default content type to return if the extension is unknown or any error occurs.</param>
    /// <returns>The path extension's MIME content type, or <paramref name="def"/>.</returns>
    public static string GetContentTypeForExtension(string extension, string def)
    {
      if ((extension == null) || (extension.Length == 0))
      {
        return def;
      }
      try
      {
        using (RegistryKey key = Registry.ClassesRoot.OpenSubKey(extension, false))
        {
          // OpenSubKey returns null for an unregistered extension.
          if (key == null)
          {
            return def;
          }
          // The MIME type is stored in the "Content Type" value. The key's default value
          // (read previously) holds the ProgID, e.g. "htmlfile", not a content type.
          string contentType = key.GetValue("Content Type", def) as string;
          return (contentType != null) ? contentType : def;
        }
      }
      catch(Exception)
      {
        // Best effort by contract: registry access problems fall back to the default.
        return def;
      }
    }

    /// <summary>
    /// Loads an HTML document from an Internet resource and writes it, as XML, to the given XmlTextWriter.
    /// </summary>
    /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <param name="writer">The XmlTextWriter to which you want to save.</param>
    public void LoadHtmlAsXml(string htmlUrl, XmlTextWriter writer)
    {
      Load(htmlUrl).Save(writer);
    }

    /// <summary>
    /// Loads an HTML document from an Internet resource, applies an XSLT transformation
    /// and writes the result to the given XmlTextWriter. No intermediate XML file is kept.
    /// </summary>
    /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
    /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
    /// <param name="writer">The XmlTextWriter to which you want to save.</param>
    public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer)
    {
      const string noIntermediateFile = null;
      LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, noIntermediateFile);
    }

    /// <summary>
    /// Loads an HTML document from an Internet resource and saves it to the specified XmlTextWriter, after an XSLT transformation.
    /// </summary>
    /// <remarks>
    /// Three parameters are made available to the stylesheet in the empty namespace:
    /// "url", "requestDuration" and "fromCache". They are written into
    /// <paramref name="xsltArgs"/> when one is supplied, replacing same-named entries.
    /// </remarks>
    /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp". May not be null.</param>
    /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load. May not be null.</param>
    /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform. May be null.</param>
    /// <param name="writer">The XmlTextWriter to which you want to save. May not be null.</param>
    /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes. May be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="htmlUrl"/>, <paramref name="xsltUrl"/> or <paramref name="writer"/> is null.</exception>
    public void LoadHtmlAsXml(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, XmlTextWriter writer, string xmlPath)
    {
      // Fail before any network or disk activity when a required argument is missing.
      if (htmlUrl == null)
      {
        throw new ArgumentNullException("htmlUrl");
      }
      if (xsltUrl == null)
      {
        throw new ArgumentNullException("xsltUrl");
      }
      if (writer == null)
      {
        throw new ArgumentNullException("writer");
      }

      HtmlDocument doc = Load(htmlUrl);

      if (xmlPath != null)
      {
        XmlTextWriter w = new XmlTextWriter(xmlPath, doc.Encoding);
        try
        {
          doc.Save(w);
        }
        finally
        {
          // Release the file even when saving fails.
          w.Close();
        }
      }
      if (xsltArgs == null)
      {
        xsltArgs = new XsltArgumentList();
      }

      // add some useful variables to the xslt doc
      // AddParam throws when a parameter already exists, which happens as soon as a
      // caller reuses its argument list for a second call; drop stale values first.
      xsltArgs.RemoveParam("url", "");
      xsltArgs.AddParam("url", "", htmlUrl);
      xsltArgs.RemoveParam("requestDuration", "");
      xsltArgs.AddParam("requestDuration", "", RequestDuration);
      xsltArgs.RemoveParam("fromCache", "");
      xsltArgs.AddParam("fromCache", "", FromCache);

      XslCompiledTransform xslt = new XslCompiledTransform();
      xslt.Load(xsltUrl);
      xslt.Transform(doc, xsltArgs, writer);
    }

    /// <summary>
    /// Creates an instance of the given type by deserializing the XML form of the specified
    /// Internet resource, without any XSLT transformation.
    /// </summary>
    /// <param name="url">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <param name="type">The requested type.</param>
    /// <returns>An newly created instance.</returns>
    public object CreateInstance(string url, Type type)
    {
      const string noStylesheet = null;
      const XsltArgumentList noArguments = null;
      return CreateInstance(url, noStylesheet, noArguments, type);
    }

    /// <summary>
    /// Creates an instance of the given type from the specified Internet resource,
    /// optionally transformed by an XSLT stylesheet. No intermediate XML file is kept.
    /// </summary>
    /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load.</param>
    /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
    /// <param name="type">The requested type.</param>
    /// <returns>An newly created instance.</returns>
    public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type)
    {
      const string noIntermediateFile = null;
      return CreateInstance(htmlUrl, xsltUrl, xsltArgs, type, noIntermediateFile);
    }

    /// <summary>
    /// Creates an instance of the given type from the specified Internet resource:
    /// the document is converted to XML (optionally through an XSLT stylesheet) and
    /// the XML is deserialized with an XmlSerializer for <paramref name="type"/>.
    /// </summary>
    /// <param name="htmlUrl">The requested URL, such as "http://Myserver/Mypath/Myfile.asp".</param>
    /// <param name="xsltUrl">The URL that specifies the XSLT stylesheet to load. When null, no transformation is applied.</param>
    /// <param name="xsltArgs">An XsltArgumentList containing the namespace-qualified arguments used as input to the transform.</param>
    /// <param name="type">The requested type. May not be null.</param>
    /// <param name="xmlPath">A file path where the temporary XML before transformation will be saved. Mostly used for debugging purposes. Only used when a stylesheet is given.</param>
    /// <returns>An newly created instance.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="type"/> is null.</exception>
    /// <exception cref="Exception">
    /// Deserialization failed; the message contains the serializer error and the XML,
    /// and InnerException holds the original InvalidOperationException.
    /// </exception>
    public object CreateInstance(string htmlUrl, string xsltUrl, XsltArgumentList xsltArgs, Type type, string xmlPath)
    {
      // Check up front so a missing type is reported before the download happens.
      if (type == null)
      {
        throw new ArgumentNullException("type");
      }

      StringWriter sw = new StringWriter();
      XmlTextWriter writer = new XmlTextWriter(sw);
      if (xsltUrl == null)
      {
        LoadHtmlAsXml(htmlUrl, writer);
      }
      else
      {
        // A null xmlPath simply means "no intermediate file", so one call covers both cases.
        LoadHtmlAsXml(htmlUrl, xsltUrl, xsltArgs, writer, xmlPath);
      }
      writer.Flush();
      string xml = sw.ToString();

      XmlSerializer serializer = new XmlSerializer(type);
      using (StringReader sr = new StringReader(xml))
      using (XmlTextReader reader = new XmlTextReader(sr))
      {
        try
        {
          return serializer.Deserialize(reader);
        }
        catch(InvalidOperationException ex)
        {
          // Same exception type and message as before, now keeping the cause attached.
          throw new Exception(ex.ToString() + ", --- xml:" + xml, ex);
        }
      }
    }

    /// <summary>
    /// Gets or Sets the root directory of the cache. While it is null, caching is off.
    /// </summary>
    public string CachePath
    {
      get { return _cachePath; }
      set { _cachePath = value; }
    }

    /// <summary>
    /// Gets or Sets a value indicating whether the caching mechanisms should be used or not.
    /// Always reads as false while no cache path is defined.
    /// </summary>
    /// <exception cref="HtmlWebException">Set to true before a CachePath has been defined.</exception>
    public bool UsingCache
    {
      get
      {
        // The flag only counts when there is somewhere to cache to.
        return (_cachePath != null) && _usingCache;
      }
      set
      {
        bool enabling = value;
        if (enabling && (_cachePath == null))
        {
          throw new HtmlWebException("You need to define a CachePath first.");
        }
        _usingCache = value;
      }
    }
  }
}
www.java2v.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.