// Copyright 2005 by Omar Al Zabir. All rights are reserved.
//
// If you like this code then feel free to go ahead and use it.
// The only thing I ask is that you don't remove or alter my copyright notice.
//
// Your use of this software is entirely at your own risk. I make no claims or
// warrantees about the reliability or fitness of this code for any particular purpose.
// If you make changes or additions to this code please mark your code as being yours.
//
// website http://www.oazabir.com, email OmarAlZabir@gmail.com, msn oazabir@hotmail.com
using System;
using System.Globalization;
using System.Text;
using System.IO;
using System.Collections;
using System.Xml;
using System.Security.Cryptography;
namespace RSSCommon{
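/// <summary>
/// Parses syndication feeds (Atom, RDF and RSS) from an XmlReader and converts them
/// into RssChannel objects, each holding the posts of one feed as RssFeed objects.
/// </summary>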
public class FeedProcessor
{
// MD5 hasher used by ProcessFeedNode to compute a hash-based GUID for posts
// that end up without a usable GUID.
private MD5CryptoServiceProvider _MD5 = new MD5CryptoServiceProvider();
/// <summary>
/// Reads the XML stream and, for every feed element encountered, finds out what
/// type of feed it is (Atom, RDF or RSS) and builds a channel from it.
/// </summary>
/// <param name="reader">Reader over the feed XML. It is read until the end of the stream.</param>
/// <returns>
/// A list with one RssChannel per feed element found; empty when the stream
/// contains no recognised feed element.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
public IList Parse( XmlReader reader )
{
    if( reader == null )
        throw new ArgumentNullException( "reader" );

    IList channels = new ArrayList();
    while( reader.Read() )
    {
        if( reader.NodeType != XmlNodeType.Element )
            continue;

        // Feed element names are matched case-insensitively. Lower-case with the
        // invariant culture so the match never depends on the thread's culture.
        string name = reader.Name.ToLower( CultureInfo.InvariantCulture );
        switch( name )
        {
            case "atom:feed": // We have Atom Feed
            case "feed": // We have Atom Feed
                channels.Add( this.ProcessAtomFeed( reader ) );
                break;
            case "rdf:rdf": // We have rdf feed
            case "rdf": // We have rdf feed
                channels.Add( this.ProcessRdfFeed( reader ) );
                break;
            case "rss:rss": // We have rss feed
            case "rss": // We have rss feed
                channels.Add( this.ProcessRssFeed( reader ) );
                break;
        }
    }
    return channels;
}
/// <summary>
/// Process an Atom feed node and build the channel it describes.
/// </summary>
/// <param name="reader">Reader positioned on the feed start element</param>
/// <returns>
/// A channel of type Atom whose title, link, description and entries are taken
/// from the feed element.
/// </returns>
private RssChannel ProcessAtomFeed( XmlReader reader )
{
    RssChannel channel = new RssChannel();
    channel.Type = RssTypeEnum.Atom;
    channel.Feeds = new ArrayList();

    // Remember the real start tag: Parse dispatches "feed" as well as "atom:feed"
    // in any letter case, and the end tag has to match what was actually opened.
    string tagName = reader.Name;

    // True once a link has been stored, so that a later non-alternate link
    // (rel="self", rel="hub", ...) cannot replace the website link.
    bool linkFound = false;

    while( reader.Read() )
    {
        if( reader.NodeType == XmlNodeType.Element )
        {
            switch( reader.Name )
            {
                case "title": // title for channel
                    channel.Title = ReadString( reader );
                    break;
                case "link": // link to website, carried by the "href" attribute
                {
                    string href = reader.GetAttribute( "href" );
                    if( href != null )
                    {
                        // A missing "rel" means "alternate" (RFC 4287, section 4.2.7.2)
                        string rel = reader.GetAttribute( "rel" );
                        bool isAlternate = ( rel == null || rel == "alternate" );
                        if( isAlternate || !linkFound )
                        {
                            channel.Link = href;
                            linkFound = true;
                        }
                    }
                    break;
                }
                case "tagline": // description of the channel (Atom 0.3)
                case "subtitle": // description of the channel (Atom 1.0)
                case "description": // Same
                    channel.Description = ReadString( reader );
                    break;
                case "entry": // Aha! an entry
                    channel.Feeds.Add( this.ProcessAtomEntry( reader ) );
                    break;
            }
        }
        else if( reader.NodeType == XmlNodeType.EndElement )
        {
            if( reader.Name == tagName )
                break;
        }
    }
    return channel;
}
/// <summary>
/// Process an RSS feed node and build the channel it describes.
/// </summary>
/// <param name="reader">Reader positioned on the rss start element</param>
/// <returns>
/// A channel of type RSS whose title, link and description come from the channel
/// itself and whose Feeds list holds one post per item element.
/// </returns>
private RssChannel ProcessRssFeed( XmlReader reader )
{
    RssChannel channel = new RssChannel();
    channel.Type = RssTypeEnum.RSS;
    channel.Feeds = new ArrayList();

    // The end tag of the element we started on terminates the channel
    string tagName = reader.Name;

    // <image> and <textInput> carry their own title/link/description children.
    // While inside one of them this holds the container's depth so that those
    // children do not overwrite the channel's values; -1 when not skipping.
    int skippedDepth = -1;

    while( reader.Read() )
    {
        if( reader.NodeType == XmlNodeType.Element && skippedDepth < 0 )
        {
            switch( reader.Name )
            {
                case "title":
                    channel.Title = ReadString( reader );
                    break;
                case "link":
                    channel.Link = ReadString( reader );
                    break;
                case "description":
                    channel.Description = ReadString( reader );
                    break;
                case "item":
                    channel.Feeds.Add( this.ProcessRssItem( reader ) );
                    break;
                case "image":
                case "textInput":
                case "textinput":
                    // An empty element has no end tag to wait for
                    if( !reader.IsEmptyElement )
                        skippedDepth = reader.Depth;
                    break;
            }
        }

        // Deliberately not an "else": ReadString and ProcessRssItem leave the reader
        // on an end element which has to be examined in the same iteration.
        if( reader.NodeType == XmlNodeType.EndElement )
        {
            if( skippedDepth >= 0 )
            {
                // An end tag reports the same depth as its start tag
                if( reader.Depth == skippedDepth )
                    skippedDepth = -1;
            }
            else if( reader.Name == tagName )
            {
                break;
            }
        }
    }
    return channel;
}
/// <summary>
/// Process an RDF (RSS 1.0) feed node and build the channel it describes.
/// </summary>
/// <param name="reader">Reader positioned on the rdf start element</param>
/// <returns>
/// A channel whose title, link and description come from the channel element
/// and whose Feeds list holds one post per item element.
/// </returns>
private RssChannel ProcessRdfFeed( XmlReader reader )
{
    RssChannel channel = new RssChannel();
    // NOTE(review): RDF channels are tagged as RSS here; whether RssTypeEnum has a
    // dedicated RDF member is not visible from this file.
    channel.Type = RssTypeEnum.RSS;
    channel.Feeds = new ArrayList();

    // The end tag of the element we started on terminates the feed
    string tagName = reader.Name;

    // In RDF feeds <image> and <textinput> are siblings of <channel> and carry
    // their own title/link/description children. While inside one of them this
    // holds the container's depth so that those children do not overwrite the
    // channel's values; -1 when not skipping.
    int skippedDepth = -1;

    while( reader.Read() )
    {
        if( reader.NodeType == XmlNodeType.Element && skippedDepth < 0 )
        {
            switch( reader.Name )
            {
                case "title":
                    channel.Title = ReadString( reader );
                    break;
                case "link":
                    channel.Link = ReadString( reader );
                    break;
                case "description":
                    channel.Description = ReadString( reader );
                    break;
                case "item":
                    channel.Feeds.Add( this.ProcessRdfItem( reader ) );
                    break;
                case "image":
                case "textInput":
                case "textinput":
                    // An empty element (e.g. <image rdf:resource="..."/>) has no end tag
                    if( !reader.IsEmptyElement )
                        skippedDepth = reader.Depth;
                    break;
            }
        }

        // Deliberately not an "else": ReadString and ProcessRdfItem leave the reader
        // on an end element which has to be examined in the same iteration.
        if( reader.NodeType == XmlNodeType.EndElement )
        {
            if( skippedDepth >= 0 )
            {
                // An end tag reports the same depth as its start tag
                if( reader.Depth == skippedDepth )
                    skippedDepth = -1;
            }
            else if( reader.Name == tagName )
            {
                break;
            }
        }
    }
    return channel;
}
/// <summary>
/// Reads one Atom entry into a post by delegating to ProcessFeedNode with the
/// Atom node names.
/// </summary>
/// <param name="reader">Reader positioned on the entry start element</param>
/// <returns>The RssFeed object built from the entry</returns>
private RssFeed ProcessAtomEntry( XmlReader reader )
{
    // Names of the nodes that carry the post data inside an Atom entry.
    // NOTE(review): "issued" is the Atom 0.3 date element; Atom 1.0 feeds use
    // other element names for dates - confirm whether those should be supported.
    const string entryNode = "entry";
    const string titleNode = "title";
    const string idNode = "id";
    const string linkNode = "link";
    const string issuedNode = "issued";

    RssFeed post = ProcessFeedNode( reader, entryNode, titleNode, idNode, linkNode, issuedNode );
    return post;
}
/// <summary>
/// Process an entry node which contains one post. This method is generic for Atom and RSS feeds.
///
/// All the Atom/RSS specific node names are passed in via parameters. While the
/// values of interest are extracted, the whole entry is also copied into the
/// XML property of the returned post.
/// </summary>
/// <param name="reader">Reader positioned on the entry start element</param>
/// <param name="itemNodeName">Name of the node which identifies the post. For RSS it's "item", for Atom it's "entry"</param>
/// <param name="titleNodeName">Name of title node</param>
/// <param name="guidNodeName">Name of GUID node.</param>
/// <param name="linkNodeName">Name of link node</param>
/// <param name="pubDateNodeName">Name of publish date node</param>
/// <returns>
/// A populated post. Its Guid is the GUID node's text when that is not empty,
/// otherwise the first non-empty link, otherwise a Base64 MD5 hash of the title
/// (or of the post's XML when there is no title either).
/// </returns>
private RssFeed ProcessFeedNode( XmlReader reader, string itemNodeName,
    string titleNodeName, string guidNodeName, string linkNodeName,
    string pubDateNodeName )
{
    RssFeed feed = new RssFeed();

    // Build a buffer which stores the entire XML content of the entry
    StringBuilder xmlBuffer = new StringBuilder( 1024 );
    XmlTextWriter writer = new XmlTextWriter( new StringWriter( xmlBuffer ) );
    writer.Namespaces = false;
    writer.Indentation = 1;
    writer.IndentChar = '\t';
    writer.Formatting = Formatting.Indented;
    writer.WriteStartElement( itemNodeName );

    // First non-empty link seen; used as GUID when no usable GUID node exists
    string firstLink = null;

    string lastNode = reader.Name;

    // True when the previous iteration already moved the reader onto a node that
    // has not been looked at yet (ReadInnerXml and ReadEndElement both step past
    // the end tag). Calling Read() in that state would silently drop the node,
    // e.g. the second of two adjacent <category> elements.
    bool advanced = false;

    while( advanced
        || ( reader.NodeType == XmlNodeType.Element && lastNode != reader.Name )
        || reader.Read() )
    {
        advanced = false;

        if( reader.NodeType == XmlNodeType.Element )
        {
            lastNode = reader.Name;
            writer.WriteStartElement( reader.Name );
            writer.WriteAttributes( reader, true );

            if( reader.Name == titleNodeName )
            {
                feed.Title = ReadString( reader );
                writer.WriteString( feed.Title );
            }
            else if( reader.Name == guidNodeName )
            {
                feed.Guid = ReadString( reader );
                writer.WriteString( feed.Guid );
            }
            else if( reader.Name == linkNodeName )
            {
                // Atom feed contains the link as "href" attribute
                string link = reader.GetAttribute( "href", "" );
                if( null == link )
                {
                    // but Rss feed has the link as value
                    link = ReadString( reader );
                    writer.WriteString( link );
                }
                if( firstLink == null || firstLink.Length == 0 )
                {
                    firstLink = link;
                }
            }
            else if( reader.Name == pubDateNodeName )
            {
                string date = ReadString( reader );
                feed.PublishDate = this.FormatDate( date );
                writer.WriteString( date );
            }
            else
            {
                // Copy any other element verbatim. ReadInnerXml leaves the reader
                // on the node that follows the element's end tag.
                writer.WriteRaw( reader.ReadInnerXml() );
                advanced = true;
            }

            // Close the element started
            writer.WriteEndElement();

            // For empty elements the reader is still on the start tag and
            // ReadEndElement would fail, so only step over a real end tag
            if( reader.NodeType == XmlNodeType.EndElement )
            {
                if( reader.Name == itemNodeName ) break;
                reader.ReadEndElement();
                advanced = true;
            }
        }

        if( reader.NodeType == XmlNodeType.EndElement )
        {
            if( reader.Name == itemNodeName )
                break;
        }
    }

    writer.WriteEndElement();
    writer.Close();
    feed.XML = xmlBuffer.ToString();

    // An absent or empty GUID node falls back to the link, whatever order the
    // two nodes appeared in
    if( feed.Guid == null || feed.Guid.Length == 0 )
    {
        feed.Guid = firstLink;
    }

    // If neither GUID nor link is found, calculate a hash of the post and make
    // it the GUID. The title is hashed when there is one; otherwise the whole
    // XML is used so that untitled posts neither crash nor share one GUID.
    if( feed.Guid == null || feed.Guid.Length == 0 )
    {
        string hashSource = feed.Title;
        if( hashSource == null || hashSource.Length == 0 )
        {
            hashSource = feed.XML;
        }
        byte[] sourceBytes = System.Text.Encoding.UTF8.GetBytes( hashSource );
        byte[] hashBytes = _MD5.ComputeHash( sourceBytes );
        feed.Guid = Convert.ToBase64String( hashBytes );
    }
    return feed;
}
/// <summary>
/// Reads one RSS item into a post by delegating to ProcessFeedNode with the
/// RSS node names.
/// </summary>
/// <param name="reader">Reader positioned on the item start element</param>
/// <returns>The RssFeed object built from the item</returns>
private RssFeed ProcessRssItem( XmlReader reader )
{
    // Names of the nodes that carry the post data inside an RSS item
    const string itemNode = "item";
    const string titleNode = "title";
    const string guidNode = "guid";
    const string linkNode = "link";
    const string dateNode = "pubDate";

    RssFeed post = ProcessFeedNode( reader, itemNode, titleNode, guidNode, linkNode, dateNode );
    return post;
}
/// <summary>
/// Reads one RDF item into a post by delegating to ProcessFeedNode with the
/// RDF node names. Only the publish date node ("dc:date") differs from RSS.
/// </summary>
/// <param name="reader">Reader positioned on the item start element</param>
/// <returns>The RssFeed object built from the item</returns>
private RssFeed ProcessRdfItem( XmlReader reader )
{
    // Names of the nodes that carry the post data inside an RDF item
    const string itemNode = "item";
    const string titleNode = "title";
    const string guidNode = "guid";
    const string linkNode = "link";
    const string dateNode = "dc:date";

    RssFeed post = ProcessFeedNode( reader, itemNode, titleNode, guidNode, linkNode, dateNode );
    return post;
}
/// <summary>
/// Converts the publish date text of a post into a DateTime, trying progressively
/// more lenient interpretations. Every parse is done with the invariant culture
/// and DateTimeStyles.AdjustToUniversal, so a date that carries time zone
/// information is returned as UTC.
/// </summary>
/// <param name="date">Date text as found in the feed. May be null or empty.</param>
/// <returns>
/// The parsed date, or the current UTC time when the text cannot be understood
/// at all.
/// </returns>
private DateTime FormatDate( string date )
{
    if( date == null )
        return DateTime.UtcNow;

    // Text taken from a feed may be surrounded by blanks or line breaks, which
    // ParseExact rejects and which would throw off the suffix stripping below
    date = date.Trim();
    if( date.Length == 0 )
        return DateTime.UtcNow;

    // 1. Let the framework recognise the date by itself
    try
    {
        return DateTime.Parse( date, CultureInfo.InvariantCulture,
            DateTimeStyles.AdjustToUniversal );
    }
    catch( FormatException )
    {
        // not recognised, try the explicit formats
    }

    // 2. Explicit formats, tried in this order:
    //    "r"/"s"/"u" are the standard RFC1123, sortable and universal sortable
    //    patterns, followed by RFC822 with a numeric offset, compact ISO 8601
    //    and ISO 8601 with a trailing "Z".
    string[] formats = new string[]
    {
        "r",
        "s",
        "u",
        "ddd, dd MMM yyyy HH:mm:ss zzz",
        "yyyyMMddTHHmmss",
        "yyyy-MM-ddTHH:mm:ssZ"
    };
    try
    {
        return DateTime.ParseExact( date, formats, DateTimeFormatInfo.InvariantInfo,
            DateTimeStyles.AdjustToUniversal );
    }
    catch( FormatException )
    {
        // none of the known formats matched
    }

    // 3. Last try, may be the date ends with some +0600 or similar 5 character
    //    suffix: remove that and read the remainder as GMT
    if( date.Length > 5 )
    {
        string strippedDate = date.Substring( 0, date.Length - 5 ) + "GMT";
        try
        {
            return DateTime.Parse( strippedDate, CultureInfo.InvariantCulture,
                DateTimeStyles.AdjustToUniversal );
        }
        catch( FormatException )
        {
            // still unreadable
        }
    }

    // All failed! Fall back to the current time, in UTC like every other result
    return DateTime.UtcNow;
}
// Scratch buffer shared by all ReadString calls so that a new StringBuilder
// does not have to be allocated for every node.
private StringBuilder buffer = new StringBuilder(100);

/// <summary>
/// Replacement for XmlReader.ReadString which stops at the end tag instead of
/// moving past it. Only the Text and CDATA nodes found after the start tag are
/// collected; the reader is left on the first node that is neither of those.
/// </summary>
/// <param name="reader">Reader positioned on the start element to read</param>
/// <returns>
/// The concatenated Text/CDATA content, or an empty string for an empty element
/// or an element without text.
/// </returns>
private string ReadString( XmlReader reader )
{
    // Empty elements have no content
    if( reader.IsEmptyElement )
    {
        return string.Empty;
    }

    // Start from a clean buffer
    buffer.Length = 0;

    // Step over the start tag and any white space in front of the content
    for( ;; )
    {
        if( reader.EOF )
            break;

        XmlNodeType current = reader.NodeType;
        if( current != XmlNodeType.Element && current != XmlNodeType.Whitespace )
            break;

        reader.Read();
    }

    // Collect consecutive Text and CDATA nodes; any other node type, the end
    // element in particular, ends the content
    for( ;; )
    {
        XmlNodeType current = reader.NodeType;
        bool isContent = ( current == XmlNodeType.Text || current == XmlNodeType.CDATA );
        if( !isContent )
            break;

        buffer.Append( reader.Value );
        reader.Read();
    }

    return buffer.ToString();
}
}
}
|