# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2008, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: RssDocument.py 1438 2008-03-10 19:02:16Z dhiraj $
"""
Module for RSS Document object.
This module contains a RssDocument object for use in creating and formating XML DOM documents comforming to
rss20 and atom10 protocols for a given channel/feed or item/entry element.
"""
import xml.dom.minidom
from feedparser import FeedParserDict
import RssUtils
from snaplogic.common.snap_exceptions import *
class RssDocument:
    """
    This class provides interfaces for formatting an XML DOM document conforming to rss20 or atom10.
    It takes a feedparser-style mapping and creates an XML DOM based on the values of that mapping.
    The protocols supported are 'rss20' and 'atom10'. Although the feedparser object contains mostly
    normalized attributes, some attributes are rendered differently depending on the protocol of the
    resulting document.

    The class keeps no state; every method builds on (or creates) the document it returns.
    """

    # Namespace that is never declared on an RSS 2.0 root element.
    _ATOM_NAMESPACE = 'http://www.w3.org/2005/Atom'

    def __init__(self):
        """
        Initialization.
        """
        pass

    # ------------------------------------------------------------------
    # Private helpers shared by the four builders below.
    # ------------------------------------------------------------------

    def _appendTextElement(self, doc, parent, tag, text):
        """
        Append <tag>text</tag> to parent and return the new element.

        The text node is created before anything is attached, so when minidom rejects the
        value (it raises TypeError for non-string data) the parent is left untouched.
        """
        elem = doc.createElement(tag)
        elem.appendChild(doc.createTextNode(text))
        parent.appendChild(elem)
        return elem

    def _appendGenerator(self, doc, parent, detail):
        """
        Append a 'generator' element built from a generator_detail mapping.
        Nothing is appended when the mapping has no 'name'.
        """
        if 'name' not in detail:
            return
        elem = doc.createElement('generator')
        text = doc.createTextNode(detail['name'])
        if 'href' in detail:
            elem.setAttribute('uri', detail['href'])
        if 'version' in detail:
            elem.setAttribute('version', detail['version'])
        elem.appendChild(text)
        parent.appendChild(elem)

    def _appendPerson(self, doc, parent, tag, detail):
        """
        Append a person construct (name / uri / email children) named tag.
        Nothing is appended when the mapping has no 'name'.
        """
        if 'name' not in detail:
            return
        person = doc.createElement(tag)
        self._appendTextElement(doc, person, 'name', detail['name'])
        for key, child_tag in (('href', 'uri'), ('email', 'email')):
            if key in detail:
                self._appendTextElement(doc, person, child_tag, detail[key])
        parent.appendChild(person)

    def _appendLink(self, doc, parent, link):
        """
        Append an attribute-only 'link' element carrying every key of the link mapping.
        """
        elem = doc.createElement('link')
        for name, value in link.items():
            elem.setAttribute(name, value)
        parent.appendChild(elem)

    def _pickChannelLink(self, links):
        """
        Return the link mapping to publish as the RSS channel link: the first one with
        rel='self' and a non-empty href, otherwise the first one with a non-empty href,
        otherwise None.
        """
        fallback = None
        for link in links:
            if 'href' in link and link['href']:
                if 'rel' in link and link['rel'] == 'self':
                    return link
                if fallback is None:
                    fallback = link
        return fallback

    def _appendRssDate(self, doc, parent, tag, value):
        """
        Append an rss20 date element. The string comes from RssUtils.timeDateTimeToString
        and gets a ' GMT' suffix when it is non-empty.
        """
        stamp = RssUtils.timeDateTimeToString(value, 'rss20')
        if stamp:
            stamp += ' GMT'
        self._appendTextElement(doc, parent, tag, stamp)

    def _selectContent(self, value):
        """
        Return the single content mapping to render, or None when there is nothing usable.

        Only one content field is supported: for a list the first entry is taken. An empty
        list or a value that is not a mapping yields None so the caller can skip the element
        instead of failing on an unbound variable.
        """
        if isinstance(value, list):
            if not value:
                return None
            value = value[0]
        # dict is tested first; FeedParserDict is checked separately because it is not
        # necessarily a dict subclass.
        if isinstance(value, dict) or isinstance(value, FeedParserDict):
            return value
        return None

    def _createContentElement(self, doc, tag, detail, alias_html):
        """
        Create (without attaching) an element named tag holding detail['value'].

        Plain text is wrapped in a CDATA section created through the document, so that the
        node has an owner document. Text containing ']]>' cannot be stored in a CDATA section
        (minidom raises ValueError when writing it), so it falls back to an escaped text node.

        @param alias_html: When true, the type 'text/html' is written as 'html' because some
                           browsers do not display 'text/html'.
        @return: The new element, with its 'type' attribute set.
        """
        ctype = detail.get('type') or 'text/plain'
        value = detail['value']
        elem = doc.createElement(tag)
        if ctype in ('text', 'text/plain') and ']]>' not in value:
            node = doc.createCDATASection(value)
        else:
            node = doc.createTextNode(value)
            if alias_html and ctype == 'text/html':
                ctype = 'html'
        elem.appendChild(node)
        elem.setAttribute('type', ctype)
        return elem

    # ------------------------------------------------------------------
    # Public interface.
    # ------------------------------------------------------------------

    def createFeed(self, feed, proto):
        """
        Create an XML DOM document for a Feed element.

        @param feed: The feedparser top-level object.
        @type feed: FeedParserDict
        @param proto: The protocol of the feed, 'rss20' or 'atom10'.
        @type proto: string
        @return: The document (of xml.dom.minidom.Document type) of the feed.
        @raise SnapValueError: If proto is not a supported protocol.
        """
        if proto == 'rss20':
            return self.createRssChannel(feed)
        if proto == 'atom10':
            return self.createAtomFeed(feed)
        raise SnapValueError('RssDocument', 'createFeed', 'protocol', proto)

    def createItem(self, item, proto, feeddoc = None):
        """
        Create an XML DOM document for an Item element.

        @param item: The feedparser entry object.
        @type item: FeedParserDict
        @param proto: The protocol of the feed, 'rss20' or 'atom10'.
        @type proto: string
        @param feeddoc: The feed document that this item element is appended to. If it is None, a new
                        Document is created and the item becomes its root element. Otherwise the item is
                        appended to the top level element of the feed: the child of the document for
                        Atom10 ('feed'), the grandchild of the document for Rss20 ('channel').
        @type feeddoc: xml.dom.minidom.Document
        @return: The document (of xml.dom.minidom.Document type) of the entry.
        @raise SnapValueError: If proto is not a supported protocol.
        """
        if proto == 'rss20':
            return self.createRssItem(item, feeddoc)
        if proto == 'atom10':
            return self.createAtomEntry(item, feeddoc)
        raise SnapValueError('RssDocument', 'createItem', 'protocol', proto)

    def createRssChannel(self, feed):
        """
        Create an XML DOM document for a Feed element of rss20 protocol.

        @param feed: The feedparser top-level object.
        @type feed: FeedParserDict
        @return: The document (of xml.dom.minidom.Document type) of the feed.
        """
        feeddoc = xml.dom.minidom.Document()
        # First level element - 'rss'
        elem_rss = feeddoc.createElement('rss')
        elem_rss.setAttribute('version', '2.0')
        feeddoc.appendChild(elem_rss)
        # Second level element - 'channel'
        elem_ch = feeddoc.createElement('channel')
        elem_rss.appendChild(elem_ch)
        # Child elements of 'channel'
        for k, v in feed.items():
            if k == 'namespaces':
                for prefix, uri in v.items():
                    # Declaring the Atom namespace keeps browsers from displaying the document.
                    if uri == self._ATOM_NAMESPACE:
                        continue
                    # The default (prefix-less) namespace is deliberately not declared.
                    if prefix:
                        elem_rss.setAttribute('xmlns:' + prefix, uri)
            elif k == 'links':
                # Used only when there is no 'link' attribute, which the generic branch renders.
                if 'link' not in feed:
                    chosen = self._pickChannelLink(v)
                    if chosen:
                        self._appendTextElement(feeddoc, elem_ch, 'link', chosen['href'])
            elif k == 'generator':
                # generator_detail takes precedence when it exists.
                if 'generator_detail' not in feed:
                    self._appendTextElement(feeddoc, elem_ch, 'generator', v)
            elif k == 'generator_detail':
                self._appendGenerator(feeddoc, elem_ch, v)
            elif k == 'subtitle':
                self._appendTextElement(feeddoc, elem_ch, 'description', v)
            elif k == 'rights':
                self._appendTextElement(feeddoc, elem_ch, 'copyright', v)
            elif k == 'published' and v:
                self._appendRssDate(feeddoc, elem_ch, 'pubDate', v)
            elif k == 'updated' and v:
                self._appendRssDate(feeddoc, elem_ch, 'lastBuildDate', v)
            # TODO: filter out rss20-undefined attributes
            elif v:
                self._appendTextElement(feeddoc, elem_ch, k, v)
        return feeddoc

    def createAtomFeed(self, feed):
        """
        Create an XML DOM document for a Feed element of atom10 protocol.

        @param feed: The feedparser top-level object.
        @type feed: FeedParserDict
        @return: The document (of xml.dom.minidom.Document type) of the feed.
        """
        feeddoc = xml.dom.minidom.Document()
        # First level element - 'feed'
        elem_feed = feeddoc.createElement('feed')
        elem_feed.setAttribute('xmlns', self._ATOM_NAMESPACE)
        feeddoc.appendChild(elem_feed)
        # Child elements of 'feed'
        for k, v in feed.items():
            if k == 'namespaces':
                for prefix, uri in v.items():
                    # The default xmlns attribute has been set above already.
                    if prefix:
                        elem_feed.setAttribute('xmlns:' + prefix, uri)
            elif k == 'links':
                for link in v:
                    if 'href' in link and link['href']:
                        self._appendLink(feeddoc, elem_feed, link)
            elif k == 'generator':
                # generator_detail takes precedence when it exists.
                if 'generator_detail' not in feed:
                    self._appendTextElement(feeddoc, elem_feed, 'generator', v)
            elif k == 'generator_detail':
                self._appendGenerator(feeddoc, elem_feed, v)
            elif (k == 'published' or k == 'updated') and v:
                stamp = RssUtils.timeDateTimeToString(v, 'atom10')
                self._appendTextElement(feeddoc, elem_feed, k, stamp)
            # TODO: filter out atom10-undefined attributes
            elif v:
                self._appendTextElement(feeddoc, elem_feed, k, v)
        return feeddoc

    def createRssItem(self, item, feeddoc = None):
        """
        Create an XML DOM document for an Item element of rss20 protocol.

        @param item: The feedparser entry object.
        @type item: FeedParserDict
        @param feeddoc: The feed document that this item element is appended to. If it is None, a new
                        Document is created and the item becomes its root element. Otherwise the item is
                        appended to the first 'channel' element of the given document.
        @type feeddoc: xml.dom.minidom.Document
        @return: The document (of xml.dom.minidom.Document type) of the entry.
        """
        if not feeddoc:
            feeddoc = xml.dom.minidom.Document()
            root = feeddoc
        else:
            # Get the 'channel' element, child element of the top 'rss' element
            root = feeddoc.getElementsByTagName('channel')[0]
        # The 'item' element.
        elem_item = feeddoc.createElement('item')
        root.appendChild(elem_item)
        # Child elements of 'item'
        for k, v in item.items():
            if k == 'link':
                # 'links' takes precedence when it exists.
                if 'links' not in item:
                    self._appendTextElement(feeddoc, elem_item, k, v)
            elif k == 'links':
                for link in v:
                    self._appendTextElement(feeddoc, elem_item, 'link', link.get('href') or '')
                    # Allow only one 'link' element, take the first one.
                    break
            elif k == 'author':
                # author_detail takes precedence when it exists.
                if 'author_detail' not in item:
                    self._appendTextElement(feeddoc, elem_item, 'author', v)
            elif k == 'author_detail':
                self._appendPerson(feeddoc, elem_item, 'author', v)
            elif k == 'contributors':
                # Ignore it.
                pass
            elif k == 'published' and v:
                self._appendRssDate(feeddoc, elem_item, 'pubDate', v)
            elif k == 'updated' and v:
                # Both dates map to 'pubDate'; 'updated' is used only when there is no
                # 'published' value so that the item carries a single 'pubDate'.
                if not item.get('published'):
                    self._appendRssDate(feeddoc, elem_item, 'pubDate', v)
            elif k == 'content':
                detail = self._selectContent(v)
                if detail is not None:
                    elem_item.appendChild(
                        self._createContentElement(feeddoc, 'description', detail, True))
            elif k == 'summary' and v:
                # 'content' takes precedence when it exists.
                if 'content' not in item:
                    self._appendTextElement(feeddoc, elem_item, 'description', v)
            elif k == 'summary_detail':
                # Ignore it
                pass
            elif k == 'tags':
                for tag in v:
                    e = feeddoc.createElement('category')
                    # 'or' guards against keys that are present with a None value.
                    c = feeddoc.createTextNode(tag.get('term') or '')
                    scheme = tag.get('scheme')
                    if scheme:
                        e.setAttribute('domain', scheme)
                    e.appendChild(c)
                    elem_item.appendChild(e)
            # TODO: filter out rss20-undefined attributes
            elif v:
                self._appendTextElement(feeddoc, elem_item, k, v)
        return feeddoc

    def createAtomEntry(self, item, feeddoc = None):
        """
        Create an XML DOM document for an Entry element of atom10 protocol.

        @param item: The feedparser entry object.
        @type item: FeedParserDict
        @param feeddoc: The feed document that this entry element is appended to. If it is None, a new
                        Document is created and the entry becomes its root element. Otherwise the entry
                        is appended to the root ('feed') element of the given document.
        @type feeddoc: xml.dom.minidom.Document
        @return: The document (of xml.dom.minidom.Document type) of the entry.
        """
        if not feeddoc:
            feeddoc = xml.dom.minidom.Document()
            root = feeddoc
        else:
            # Get the top 'feed' element
            root = feeddoc.documentElement
        # The 'entry' element.
        elem_ent = feeddoc.createElement('entry')
        root.appendChild(elem_ent)
        # Child elements of 'entry'
        for k, v in item.items():
            if k == 'link':
                self._appendTextElement(feeddoc, elem_ent, k, v)
            elif k == 'links':
                for link in v:
                    self._appendLink(feeddoc, elem_ent, link)
            elif k == 'author':
                # author_detail takes precedence when it exists.
                if 'author_detail' not in item:
                    self._appendTextElement(feeddoc, elem_ent, 'author', v)
            elif k == 'author_detail':
                self._appendPerson(feeddoc, elem_ent, 'author', v)
            elif k == 'contributors':
                for person in v:
                    self._appendPerson(feeddoc, elem_ent, 'contributor', person)
            elif (k == 'published' or k == 'updated') and v:
                stamp = RssUtils.timeDateTimeToString(v, 'atom10')
                self._appendTextElement(feeddoc, elem_ent, k, stamp)
            elif k == 'content':
                # Support only one content field for now ...
                detail = self._selectContent(v)
                if detail is not None:
                    e = self._createContentElement(feeddoc, 'content', detail, True)
                    base = detail.get('base', '')
                    if base:
                        e.setAttribute('xml:base', base)
                    lang = detail.get('language', '')
                    if lang:
                        e.setAttribute('xml:lang', lang)
                    elem_ent.appendChild(e)
            elif k == 'summary' and v:
                # Precedence is content, then summary_detail, then summary.
                if 'summary_detail' not in item and 'content' not in item:
                    self._appendTextElement(feeddoc, elem_ent, 'content', v)
            elif k == 'summary_detail':
                # Rendered as 'content', hence skipped when a real 'content' exists so the
                # entry never carries two 'content' elements.
                if 'content' not in item:
                    e = self._createContentElement(feeddoc, 'content', v, False)
                    base = v.get('base', '')
                    if base:
                        e.setAttribute('xml:base', base)
                    elem_ent.appendChild(e)
            elif k == 'tags':
                for tag in v:
                    e = feeddoc.createElement('category')
                    # 'or' guards against keys that are present with a None value, which
                    # minidom cannot serialize as an attribute value.
                    e.setAttribute('term', tag.get('term') or '')
                    e.setAttribute('label', tag.get('label') or '')
                    e.setAttribute('scheme', tag.get('scheme') or '')
                    elem_ent.appendChild(e)
            # TODO: filter out atom10-undefined attributes
            elif v:
                self._appendTextElement(feeddoc, elem_ent, k, v)
        return feeddoc
|