# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: RssRead.py 10330 2009-12-24 22:13:38Z grisha $
"""
RssRead Module for RSS Reader.
This module contains a RssRead object for component operation, a RssReadResDef object for resource definition,
and a RssReadCapability object for capability description.
"""
import re, os, time, datetime
import traceback
import feedparser
from feedparser import FeedParserDict
from snaplogic.common.Rss import RssUtils
from snaplogic.common.SnapReader import SnapReader
from snaplogic.common.SnapReader import SnapFtpReader
from snaplogic.common.SnapReader import SnapHttpReader
from snaplogic.common.SnapReader import SnapFileReader
from snaplogic.common.snap_exceptions import *
from snaplogic.common import snap_log
from snaplogic.common.data_types import SnapString,SnapDateTime
from snaplogic.common import version_info
import snaplogic.components as components
import snaplogic.components.FileUtils as FileUtils
from snaplogic.cc import component_api
from snaplogic.cc.component_api import ComponentAPI
import snaplogic.cc.prop as prop
# Names exported by a star-import of this module.
__all__ = ["RssRead"]
class RssRead(ComponentAPI):
    """
    Component that reads an RSS or Atom feed and fans it out into record views.

    The feed is parsed with feedparser and its contents are normalized into a
    set of output views shared by the RSS and Atom protocols. 'feed' and 'item'
    carry the core attributes; the remaining views ('link', 'author',
    'contributor', 'category', 'content', 'namespace', 'generator') carry child
    elements that may repeat or that have several attributes of their own.
    Every record in those views has an 'id' field holding the ID of the feed or
    item it belongs to.

    Some RSS/Atom attributes are not captured by these views. Supporting one
    means adding a field to the relevant view in create_resource_template()
    and filling it in the matching _write_* method.
    """

    api_version = '1.0'
    component_version = '1.2'

    capabilities = {
        ComponentAPI.CAPABILITY_INPUT_VIEW_LOWER_LIMIT  : 0,
        ComponentAPI.CAPABILITY_INPUT_VIEW_UPPER_LIMIT  : 0,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_LOWER_LIMIT : 2,
        ComponentAPI.CAPABILITY_OUTPUT_VIEW_UPPER_LIMIT : 9,
    }

    component_description = "This component interfaces to read a RSS site and parse the feed into several output views."
    component_label = "RSS Reader"
    component_doc_uri = "https://www.snaplogic.org/trac/wiki/Documentation/%s/ComponentRef/RssRead" % \
                        version_info.doc_uri_version

    # Description of the InputURL property. Shared by the resource template and
    # by the 1.0 -> 1.1 upgrade so that the two cannot drift apart.
    _INPUT_URL_DOC = (
        "The URI of the RSS feed. You can enter either a URI pointing to a HTTP or FTP server, or a local file path. The following URI schemes are supported: file, http, https, ftp, file.\n"
        "\n\nExamples of valid URIs:\n"
        "http://www.server.com/dir/input.file\n"
        "https://www.server.com/dir/input.file\n"
        "/home/user/input.file\n"
        "c:\\dir\\input.file\n"
        "file:///c:/dir/input.file\n"
        "ftp://ftp.server.com/dir/input.file\n"
    )

    def create_resource_template(self):
        """
        Create the RssRead resource template.

        Defines the component specific properties (InputURL, username and
        password) and the nine output views with their fields.

        At least two and at most nine output views are allowed. The two views
        this component needs are 'feed' and 'item':

          - 'feed' holds a single record describing the feed headers.
          - 'item' holds one record per item/entry of the feed.

        The other views are 'link', 'author', 'contributor', 'category',
        'content', 'namespace' and 'generator'. Each of their records has an
        'id' field associating it with the feed or item it belongs to. They
        store child elements that may occur zero or more times, or that have
        multiple attributes. For example, the 'link' view stores the several
        'link' elements of a feed or item:

            <link rel="self" href="http://...">
            <link rel="alternate" href="http://...">

        and the 'author' view stores an element with multiple attributes:

            <author name="thename" uri="http://..." email="thename@...">
        """
        self.set_property_def("InputURL",
                              prop.SimpleProp("Input URL", SnapString, self._INPUT_URL_DOC, None, True))

        # Credentials are username and password
        self.set_property_def("username",
                              prop.SimpleProp("Credentials: Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_value("username", "")
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

        # Each entry is (view name, ((field name, field type, field description), ...), view description).
        output_views = [
            (
                "feed",
                (
                    ( "id",          SnapString,   "The channel/feed ID." ),
                    ( "link",        SnapString,   "The link to this channel/feed." ),
                    ( "title",       SnapString,   "The channel/feed title." ),
                    ( "version",     SnapString,   "The protocol version this channel/feed." ),
                    ( "updated",     SnapDateTime, "The last updated time of this channel/feed." ),
                    ( "encoding",    SnapString,   "The encoding scheme." ),
                    ( "language",    SnapString,   "The language of this channel/feed." ),
                    ( "copyright",   SnapString,   "The copyright of this channel/feed." ),
                    ( "description", SnapString,   "The description of this channel/feed." ),
                ),
                "The feed view"
            ),
            (
                "item",
                (
                    ( "id",        SnapString,   "The item/entry ID." ),
                    ( "link",      SnapString,   "The link to this item/entry." ),
                    ( "title",     SnapString,   "The item/entry title." ),
                    ( "published", SnapDateTime, "The published time of this item/entry." ),
                    ( "updated",   SnapDateTime, "The last updated time of this item/entry." ),
                    ( "summary",   SnapString,   "The description of this item/entry." ),
                ),
                "The item view"
            ),
            (
                "link",
                (
                    ( "id",       SnapString, "The channel/feed or item/entry ID." ),
                    ( "href",     SnapString, "The IRI reference of the link." ),
                    ( "rel",      SnapString, "The relation type of the link." ),
                    ( "type",     SnapString, "The advisory media type." ),
                    ( "hreflang", SnapString, "The language of the resource pointed to by href." ),
                    ( "title",    SnapString, "The human-readable information about the link." ),
                    ( "length",   SnapString, "The dvisory length of the linked content in octets." ),
                ),
                "The link view"
            ),
            (
                "author",
                (
                    ( "id",    SnapString, "The channel/feed or item/entry ID." ),
                    ( "name",  SnapString, "The human-readable name of the person." ),
                    ( "href",  SnapString, "The IRI associated with the person." ),
                    ( "email", SnapString, "The email address associated with the person." ),
                ),
                "The author view"
            ),
            (
                "contributor",
                (
                    ( "id",    SnapString, "The channel/feed or item/entry ID." ),
                    ( "name",  SnapString, "The human-readable name of the person." ),
                    ( "href",  SnapString, "The IRI associated with the person." ),
                    ( "email", SnapString, "The email address associated with the person." ),
                ),
                "The contributor view"
            ),
            (
                "category",
                (
                    ( "id",     SnapString, "The channel/feed or item/entry ID." ),
                    ( "term",   SnapString, "The string identifies the category." ),
                    ( "scheme", SnapString, "The IRI that identifies a categorization." ),
                    ( "label",  SnapString, "The human-readable label for display in end-user applications." ),
                ),
                "The category view"
            ),
            (
                "content",
                (
                    ( "id",       SnapString, "The channel/feed or item/entry ID." ),
                    ( "type",     SnapString, "The type of the content, e.g. text, xhtml, etc." ),
                    ( "value",    SnapString, "The content value string." ),
                    ( "xml_base", SnapString, "The xml:base attribute." ),
                    ( "xml_lang", SnapString, "The xml:lang attribute." ),
                ),
                "The content view"
            ),
            (
                "namespace",
                (
                    ( "id",     SnapString, "The channel/feed or item/entry ID." ),
                    ( "uri",    SnapString, "The uri of the namespace." ),
                    ( "prefix", SnapString, "The prefix of the namespace." ),
                ),
                "The namespace view"
            ),
            (
                "generator",
                (
                    ( "id",      SnapString, "The channel/feed or item/entry ID." ),
                    ( "href",    SnapString, "The uri of the generator." ),
                    ( "name",    SnapString, "The generator name." ),
                    ( "version", SnapString, "The version of the generator." ),
                ),
                "The generator view"
            ),
        ]
        for (name, fields, doc) in output_views:
            self.add_record_output_view_def(name, fields, doc)

    def validate(self, err_obj):
        """
        Validate the resource.

        Nothing is checked here: per the original author's note, the necessary
        validation is expressed through property constraints and performed by
        the CC.

        @param err_obj: Object for reporting validation errors (unused).
        """
        pass

    def execute(self, input_views, output_views):
        """
        Read the feed named by InputURL and write it to the output views.

        HTTP(S) URLs are handed to feedparser directly; for any other scheme
        the data is read through the SnapReader and the content is passed to
        feedparser. One 'feed' record is written, followed by one 'item'
        record per entry, together with their child records in whichever
        optional views are connected. All output views are marked completed
        at the end.

        @param input_views: Input views (this component declares none).
        @type  input_views: dict

        @param output_views: Connected output views, keyed by view name.
        @type  output_views: dict

        @raise SnapIOError: If feedparser reports an unrecoverable problem, or
            the HTTP status is not one of 200, 301, 302 or 307.
        @raise SnapValueError: If the feed version could not be determined.
        @raise SnapComponentError: If the 'feed' view (or the 'item' view, when
            the feed has entries) is not connected.
        """
        self._inputurl = self.get_property_value('InputURL')
        self._username = self.get_property_value("username")
        self._password = self.get_property_value("password")
        self.input_views = input_views
        self.output_views = output_views

        # Create the reader for inputurl ...
        self._rdr = SnapReader.create(self._inputurl, self._username, self._password, None, self.env)
        if self._rdr.scheme == 'file':
            FileUtils.init_file_component(self, 'InputURL')

        is_http = self._rdr.scheme.startswith('http')
        if is_http:
            # Let feedparser access the URL directly for HTTP protocol.
            d = feedparser.parse(self._rdr.input)
        else:
            # Use Reader method to read the input and pass the data to feedparser.
            self._rdr.open()
            try:
                d = feedparser.parse(self._rdr.read())
            finally:
                # Release the reader even when reading or parsing fails.
                self._rdr.close()

        # The bozo attribute signals the detection of a non-well-formed XML document.
        # In situations where the HTTP Response Content-Type does not agree with the encoding
        # specified in the XML prefix, this attribute (bozo) is also set to 1.
        if d.bozo != 0:
            # feedparser takes the HTTP Response Content-Type into account to determine the
            # encoding of the received document. When its conclusion does not agree with what
            # is specified in the XML prefix it reports a message such as
            # 'documented declared as us-ascii, but parsed as utf-8' (wording of feedparser 4.1,
            # revision 1.92, which the original code was tested with). Most browsers and RSS
            # readers still display such documents, so processing continues in that one case.
            # NOTE(review): later feedparser releases are believed to spell the prefix
            # 'document declared as'; both spellings are accepted - confirm against the
            # feedparser version actually deployed.
            reason = str(d.bozo_exception)
            encoding_override = (reason.startswith('documented declared as ') or
                                 reason.startswith('document declared as '))
            recoverable = (encoding_override and
                           hasattr(d, 'status') and hasattr(d, 'encoding') and
                           reason.endswith(', but parsed as ' + d.encoding))
            if not recoverable:
                raise SnapIOError("RSS Read error.", self._rdr.input, reason)

        # When 'bozo' is 1, the 'status' attribute does not always exist.
        # Only HTTP protocol has status attribute
        if is_http:
            if d.status not in [ 200, 301, 302, 307 ]:
                raise SnapIOError("RSS Get error.", self._rdr.input, d.status)

        # Some feeds, especially non-well-formed ones, do not have version information.
        if not d.version:
            raise SnapValueError("RSS Version error.", self._rdr.input, d.version)

        self._write_feed(d)

        if d.has_key('entries'):
            for e in d.entries:
                self._write_item(e)

        for out in self.output_views.values():
            out.completed()

    def _optional_view(self, *names):
        """
        Return the first connected output view among the given names.

        @param names: Candidate view names, in order of preference.
        @type  names: strings

        @return: The output view, or None if none of the names is connected.
        """
        for name in names:
            try:
                return self.output_views[name]
            except KeyError:
                pass
        return None

    @staticmethod
    def _to_datetime(parsed):
        """
        Convert a parsed time tuple into a naive datetime.

        @param parsed: A time tuple (year, month, day, hour, minute, second, ...), or a false value.
        @type  parsed: time.struct_time or tuple

        @return: The datetime built from the first six fields, or None if parsed is false.
        @rtype:  datetime.datetime
        """
        if not parsed:
            return None
        # Timezone is not handled for now
        return datetime.datetime(*parsed[:6])

    def _write_feed(self, doc):
        """
        Write the 'feed' record and the feed level child records.

        @param doc: The parsed document.
        @type  doc: FeedParserDict

        @raise SnapComponentError: If the 'feed' output view is not connected.
        """
        try:
            output_view = self.output_views['feed']
        except KeyError:
            raise SnapComponentError("\"feed\" output view not connected.")

        r = output_view.create_record()
        r['id']          = unicode(doc.feed.get('id', ''))
        r['link']        = unicode(doc.feed.get('link', ''))
        r['title']       = unicode(doc.feed.get('title', ''))
        r['version']     = unicode(doc.get('version', ''))
        r['updated']     = self._to_datetime(doc.get('updated', None))
        r['encoding']    = unicode(doc.get('encoding', ''))
        r['language']    = unicode(doc.feed.get('language', ''))
        # feedparser documents 'rights' as an element of the feed dictionary; the top level
        # of the document is still consulted afterwards, as the original code did.
        r['copyright']   = unicode(doc.feed.get('rights', '') or doc.get('rights', ''))
        r['description'] = unicode(doc.feed.get('subtitle', ''))

        # ID is essential in these records. Not all feeds have the information stored in the expected element.
        if not r['link']:
            r['link'] = unicode(doc.get('href', ''))
        if not r['id']:
            r['id'] = r['link']

        if (not r['updated']) and doc.feed.has_key('lastbuilddate'):
            # Use the lastbuilddate element value.
            # Let the field be None if the element value is in invalid format.
            try:
                t = RssUtils.timeStrToTuple(doc.feed['lastbuilddate'])
                r['updated'] = datetime.datetime(t[0], t[1], t[2], t[3], t[4], t[5])
            except Exception:
                # Deliberately best-effort: an unparsable date leaves 'updated' as None.
                pass

        output_view.write_record(r)

        feed_id = r['id']
        self._write_link(feed_id, doc.feed)
        self._write_author(feed_id, doc.feed)
        self._write_contributor(feed_id, doc.feed)
        self._write_category(feed_id, doc.feed)
        # Namespaces are read from the parsed document itself, not from doc.feed.
        self._write_namespace(feed_id, doc)
        self._write_generator(feed_id, doc.feed)

    def _write_item(self, item):
        """
        Write one 'item' record and the item level child records.

        @param item: The item dictionary in the parsed document.
        @type  item: FeedParserDict

        @raise SnapComponentError: If the 'item' output view is not connected.
        """
        try:
            output_view = self.output_views['item']
        except KeyError:
            raise SnapComponentError("\"item\" output view not connected.")

        r = output_view.create_record()
        r['id']        = unicode(item.get('id', ''))
        r['link']      = unicode(item.get('link', ''))
        r['title']     = unicode(item.get('title', ''))
        r['summary']   = unicode(item.get('summary', ''))
        r['updated']   = self._to_datetime(item.get('updated_parsed', None))
        r['published'] = self._to_datetime(item.get('published_parsed', None))

        # Fall back to the link when the item carries no ID of its own.
        if not r['id']:
            r['id'] = r['link']

        output_view.write_record(r)

        item_id = r['id']
        self._write_link(item_id, item)
        self._write_author(item_id, item)
        self._write_contributor(item_id, item)
        self._write_category(item_id, item)
        self._write_content(item_id, item)

    def _write_link(self, id, elem):
        """
        Write one 'link' record per entry of the element's 'links' list.

        @param id: The id (foreign key) for the link record.
        @type  id: string

        @param elem: The dictionary structure of an element that may contain the 'links' element.
                     It can be either a feed or item/entry dictionary.
        @type  elem: FeedParserDict
        """
        if not elem.has_key('links'):
            return

        # This is an optional view. The resource template names it 'link'; 'links' is the name
        # the original code looked up, kept as a fallback for resources that define such a view.
        output_view = self._optional_view('link', 'links')
        if output_view is None:
            return

        # It is a list of links (dict)
        for k in elem.links:
            r = output_view.create_record()
            r['id']       = id
            r['rel']      = unicode(k.get('rel', ''))
            r['type']     = unicode(k.get('type', ''))
            r['href']     = unicode(k.get('href', ''))
            r['title']    = unicode(k.get('title', ''))
            r['length']   = unicode(k.get('length', ''))
            r['hreflang'] = unicode(k.get('hreflang', ''))
            output_view.write_record(r)

    def _write_author(self, id, elem):
        """
        Write the 'author' record.

        The structured 'author_detail' element is preferred; if only the plain
        'author' element exists its value is used as the name and the other
        fields are left empty.

        @param id: The id (foreign key) for the author record.
        @type  id: string

        @param elem: The dictionary structure of an element that may contain the 'author' element.
                     It can be either a feed or item/entry dictionary.
        @type  elem: FeedParserDict
        """
        has_detail = elem.has_key('author_detail')
        if not has_detail and not elem.has_key('author'):
            return

        # This is an optional view ...
        output_view = self._optional_view('author')
        if output_view is None:
            return

        r = output_view.create_record()
        r['id'] = id
        if has_detail:
            # A single dict
            detail = elem.author_detail
            r['name']  = unicode(detail.get('name', ''))
            r['href']  = unicode(detail.get('href', ''))
            r['email'] = unicode(detail.get('email', ''))
        else:
            # Simply take the name
            r['name']  = unicode(elem.author)
            r['href']  = u''
            r['email'] = u''
        output_view.write_record(r)

    def _write_contributor(self, id, elem):
        """
        Write one 'contributor' record per entry of the element's 'contributors' list.

        @param id: The id (foreign key) for the contributor record.
        @type  id: string

        @param elem: The dictionary structure of an element that may contain the 'contributor' element.
                     It can be either a feed or item/entry dictionary.
        @type  elem: FeedParserDict
        """
        if not elem.has_key('contributors'):
            return

        # This is an optional view. Contributors belong in their own view, not in 'author'.
        output_view = self._optional_view('contributor')
        if output_view is None:
            return

        # It is a list of contributors (dict)
        for c in elem.contributors:
            r = output_view.create_record()
            r['id']    = id
            r['name']  = unicode(c.get('name', ''))
            r['href']  = unicode(c.get('href', ''))
            r['email'] = unicode(c.get('email', ''))
            output_view.write_record(r)

    def _write_category(self, id, elem):
        """
        Write one 'category' record per entry of the element's 'tags' list.

        @param id: The id (foreign key) for the category record.
        @type  id: string

        @param elem: The dictionary structure of an element that may contain the 'category' element.
                     It can be either a feed or item/entry dictionary.
        @type  elem: FeedParserDict
        """
        if not elem.has_key('tags'):
            return

        # This is an optional view ...
        output_view = self._optional_view('category')
        if output_view is None:
            return

        for tag in elem.tags:
            r = output_view.create_record()
            r['id']     = id
            r['term']   = unicode(tag.get('term', ''))
            r['label']  = unicode(tag.get('label', ''))
            r['scheme'] = unicode(tag.get('scheme', ''))
            output_view.write_record(r)

    def _write_content(self, id, elem):
        """
        Write the 'content' records.

        One record is written per entry of the 'content' list, followed by one
        record for 'summary_detail' when that element exists.

        @param id: The id (foreign key) for the content record.
        @type  id: string

        @param elem: The dictionary structure of an item that may contain the 'content' element.
                     Only Atom Entry has content attribute.
        @type  elem: FeedParserDict
        """
        has_content = elem.has_key('content')
        has_summary = elem.has_key('summary_detail')
        if not (has_content or has_summary):
            return

        # This is an optional view ...
        output_view = self._optional_view('content')
        if output_view is None:
            return

        details = []
        if has_content:
            # It is a list of contents (dict)
            details.extend(elem.content)
        if has_summary:
            details.append(elem.summary_detail)

        for c in details:
            r = output_view.create_record()
            r['id']       = id
            r['type']     = unicode(c.get('type', ''))
            r['value']    = unicode(c.get('value', ''))
            r['xml_base'] = unicode(c.get('base', ''))
            r['xml_lang'] = unicode(c.get('language', ''))
            output_view.write_record(r)

    def _write_namespace(self, id, elem):
        """
        Write one 'namespace' record per entry of the element's 'namespaces' dictionary.

        @param id: The id (foreign key) for the namespace record.
        @type  id: string

        @param elem: The dictionary structure of a feed that may contain the 'namespaces' element.
                     Only Atom Feed has namespaces attribute.
        @type  elem: FeedParserDict
        """
        if not elem.has_key('namespaces'):
            return

        # This is an optional view ...
        output_view = self._optional_view('namespace')
        if output_view is None:
            return

        # It is a dictionary mapping prefix to uri
        for prefix, uri in elem.namespaces.items():
            r = output_view.create_record()
            r['id']     = id
            r['uri']    = unicode(uri)
            r['prefix'] = unicode(prefix)
            output_view.write_record(r)

    def _write_generator(self, id, elem):
        """
        Write the 'generator' record.

        Nothing is written when the generator has no name.

        @param id: The id (foreign key) for the generator record.
        @type  id: string

        @param elem: The dictionary structure of a feed that may contain the 'generator_detail' element.
                     Only Feed/Channel has generator_detail attribute.
        @type  elem: FeedParserDict
        """
        if not elem.has_key('generator_detail'):
            return

        # This is an optional view ...
        output_view = self._optional_view('generator')
        if output_view is None:
            return

        detail = elem.generator_detail
        if detail.get('name', ''):
            r = output_view.create_record()
            r['id']      = id
            r['href']    = unicode(detail.get('href', ''))
            r['name']    = unicode(detail.get('name', ''))
            r['version'] = unicode(detail.get('version', ''))
            output_view.write_record(r)

    def upgrade_1_0_to_1_1(self):
        """
        Upgrade resource from version 1.0 to version 1.1.

        In version 1.0 credentials were stored as a single user:passwd string separated by colon.
        In version 1.1 it's stored as two separate properties, and password is obfuscated.

        Also, change the description of the InputURL property.
        The description was changed in release 2.1.0.
        """
        # Old credentials were stored as user:password, split them into two variables
        credentials = self.get_property_value("credential")
        username = None
        password = None
        if credentials is not None:
            # Colons are allowed in passwords, so split it at the first colon
            cred_list = credentials.split(':', 1)
            # split() always yields at least one element: the username
            username = cred_list[0]
            if len(cred_list) == 2:
                # If there is a password, it's the second element of the list
                password = cred_list[1]

        # Delete the old credentials property
        self.del_property_def("credential")

        # Create the new credentials properties
        self.set_property_def("username",
                              prop.SimpleProp("Credentials: Username", SnapString,
                                              "Username to use if credentials are needed for the accessing data"))
        self.set_property_def("password",
                              prop.SimpleProp("Credentials: Password", SnapString,
                                              "Password to use if credentials are needed for the accessing data",
                                              {'obfuscate' : 0}))

        # Set the new credentials properties
        self.set_property_value("username", username)
        self.set_property_value("password", password)

        # Recreate the InputURL property with the new description, keeping its value
        filename = self.get_property_value("InputURL")
        self.del_property_def("InputURL")
        self.set_property_def("InputURL",
                              prop.SimpleProp("Input URL", SnapString, self._INPUT_URL_DOC, None, True))
        self.set_property_value("InputURL", filename)

    def upgrade_1_1_to_1_2(self):
        """
        No-op upgrade only to change component doc URI during the upgrade
        which will be done by cc_info before calling this method.
        """
        pass
|