# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: SnapReader.py 7886 2009-06-15 23:17:56Z dmitri $
"""
This module contains uniform interfaces for reading data from various input sources.
The supported input sources are file, http, https, and ftp.
"""
__docformat__ = "epytext en"
import re, os
import codecs
import urllib2, urlparse
from urllib2 import build_opener,HTTPBasicAuthHandler,HTTPPasswordMgrWithDefaultRealm
from snaplogic.common.snap_exceptions import *
# Registry that maps each supported URL scheme to the dotted path of the reader class serving it.
# A new (user-provided) reader is registered by inserting or updating an entry in this dictionary.
SnapSchemeReader = dict(
    http='snaplogic.common.SnapReader.SnapHttpReader',
    https='snaplogic.common.SnapReader.SnapHttpReader',
    ftp='snaplogic.common.SnapReader.SnapFtpReader',
    file='snaplogic.common.SnapReader.SnapFileReader',
)
"""Dictionary of supported schemes/protocols and the corresponding Reader objects."""
class SnapReader(object):
    """
    Base class of the readers that access an input source.

    The supported input sources are file, http, https, and ftp. The class interfaces allow the users to
    create a reader object, open and close a connection to the input source, and read data from the
    connection. open() (without a callback) and close() raise SnapObjTypeError here; the
    _read_encoding(), _readline_encoding() and _readlines_encoding() methods that __init__ binds
    when an encoding is given are not defined on this class.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source, in the format of 'scheme://input_path'.
        @type input: str
        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.
        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.
        @param encoding: The encoding to expect from the source.
        @type encoding: str
        @param proxy: Proxy server information that may be needed to read
                      resources at external URIs. It is only stored here.
        @type proxy: mapping of proxy config options, or None
        """
        # Save information parsed from the url
        self._input = input
        (self._scheme, self._path) = SnapReader._split_url(input)
        self._encoding = encoding
        # Credentials are normalized so that "no credential" is always the empty string.
        if username is None:
            self._username = ""
        else:
            self._username = username
        if password is None:
            self._password = ""
        else:
            self._password = password
        self._proxy = proxy
        self._resp = None
        if encoding is not None:
            # We switch to encoding read: shadow the plain read methods on this
            # instance with the decoding variants.
            self.read = self._read_encoding
            self.readline = self._readline_encoding
            self.readlines = self._readlines_encoding

    # Properties
    input = property(lambda self: self._input)
    path = property(lambda self: self._path)
    scheme = property(lambda self: self._scheme)
    handle = property(lambda self: self._resp)
    encoding = property(lambda self: self._encoding)

    @staticmethod
    def _split_url(input):
        """
        Split 'scheme://path' into its two parts.

        @param input: The input source URL.
        @type input: str
        @return: tuple (scheme, path). If the input has no '://' separator the scheme is the
                 empty string and the path is the whole input.
        @rtype: tuple
        """
        (scheme, separator, path) = input.partition('://')
        if not separator:
            # str.find() returning -1 used to be sliced blindly here, which chopped
            # characters off both ends of the input.
            return ('', input)
        return (scheme, path)

    @staticmethod
    def create(input, username=None, password=None, encoding=None, proxy=None):
        """
        Create and obtain a reader object for the given input source as specified in the input parameter.

        The reader class is looked up in SnapSchemeReader by the scheme of the input. The module
        holding the class is imported on demand if it has not been loaded yet.

        @param input: The input source URL in the format of 'scheme://input_path'.
                      For example: http://site/input, ftp://site/input, file://input.
        @type input: str
        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.
        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.
        @param encoding: The encoding to expect from the source.
        @type encoding: str
        @param proxy: Proxy server information that may be needed to read
                      resources at external URIs
        @type proxy: mapping of proxy config options, or None
        @return: A reader object.
        @rtype: snaplogic.common.SnapReader
        @except SnapValueError: on unsupported scheme, invalid reader class.
        @except SnapObjTypeError: on invalid type of reader class
        """
        # Imported here because the module-level import block does not import sys.
        import sys

        scheme = SnapReader._split_url(input)[0]
        if scheme not in SnapSchemeReader:
            raise SnapValueError('Unsupported scheme', scheme, input)

        # Load the reader object.
        rdrpath = SnapSchemeReader[scheme]
        (modname, separator, clsname) = rdrpath.rpartition('.')
        if not separator:
            # A bare name carries no module to look the class up in.
            raise SnapValueError('Invalid reader class for scheme %s' % scheme, rdrpath)
        try:
            if modname not in sys.modules:
                # Readers registered by the user may live in a module nobody imported yet.
                __import__(modname)
            cls = getattr(sys.modules[modname], clsname)
        except Exception:
            # sys.exc_info() avoids the version-specific "except X, e" / "except X as e" syntax.
            e = sys.exc_info()[1]
            raise SnapValueError('Invalid reader class for scheme %s' % scheme, rdrpath, str(e))
        if not isinstance(cls, type):
            # isinstance() also accepts classes that use a metaclass derived from type.
            raise SnapObjTypeError('Invalid type of reader class for scheme %s' % scheme, rdrpath, type(cls))
        return cls(input, username, password, encoding, proxy)

    def open(self, callback=None, cbdata=None):
        """
        Open a connection to the input source.

        @param callback: User provided callback that allows the user to create its own reader object.
                         It is called with (input, username, password, cbdata).
        @type callback: function, or None.
        @param cbdata: Opaque value handed to the callback as its last argument.
        @return: The reader object returned by the callback.
        @except SnapObjTypeError: on callback value None
        """
        if callback:
            rdr = callback(self._input, self._username, self._password, cbdata)
            return rdr
        raise SnapObjTypeError('Abstract class error')

    def close(self, reader=None):
        """
        Close the connection to the input source.

        This base implementation always raises.

        @param reader: The reader object that user created from the callback in open() method.
        @type reader: User-created reader object.
        @except SnapObjTypeError: always
        """
        raise SnapObjTypeError('Abstract class error')
def parse_url(config, url):
    """
    Parse URL and check if we should be using a proxy for this URL.

    The host name of the URL is compared against the comma-separated domain list in the
    'cc_no_proxy' option of config. An entry matches when the host name ends with it; the
    comparison is case-insensitive.

    @param config: CC config section (a mapping), or None
    @param url: URL
    @type url: str
    @return: tuple (host, no_proxy)
             where host is a string containing the hostname including port of the URL,
             and no_proxy is a boolean that is True when the proxy should be bypassed
             for this host.
    @rtype: tuple
    """
    # urlparse returns a tuple subclass that also has extra attributes such as hostname.
    parsed_url = urlparse.urlparse(url)

    # The network location includes the port number, but parsed_url.hostname doesn't
    # (and is lower-cased). We need a pure hostname to check against the no_proxy domain list.
    host = parsed_url[1]
    hostname = parsed_url.hostname

    # hostname is None when the URL has no network location, in which case nothing can match.
    if not hostname or not config or 'cc_no_proxy' not in config:
        return (host, False)

    # Check cc_no_proxy and if it matches the hostname bypass the proxy
    for domain in config['cc_no_proxy'].split(','):
        # Lower-case the entry because hostname has been lower-cased by urlparse.
        domain = domain.strip().lower()
        if domain and hostname.endswith(domain):
            return (host, True)
    return (host, False)
class SnapFtpReader(SnapReader):
    """
    Reader class for ftp input.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source.
        @type input: str
        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.
        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.
        @param encoding: The encoding to expect from the source.
        @type encoding: str
        @param proxy: Proxy server information that may be needed to read
                      resources at external URIs. open() reads the options
                      'cc_ftp_proxy', 'cc_ftp_proxy_port' and (optionally) 'cc_no_proxy' from it.
        @type proxy: mapping of proxy config options, or None
        """
        super(SnapFtpReader, self).__init__(input, username, password, encoding, proxy)

    def open(self, callback=None, cbdata=None):
        """
        Open a connection to the input source.

        @param callback: User provided callback that allows the user to create its own reader object.
        @type callback: function, or None.
        @param cbdata: Opaque value handed to the callback; ignored when there is no callback.
        @return: The object returned by the callback if one is given, otherwise this reader.
        @rtype: snaplogic.common.SnapReader
        """
        if callback:
            return super(SnapFtpReader, self).open(callback=callback, cbdata=cbdata)

        proxy_handler = None
        if self._proxy:
            proxy_server = self._proxy["cc_ftp_proxy"]
            proxy_port = self._proxy["cc_ftp_proxy_port"]
            if proxy_server != "":
                pr = proxy_server + ":" + proxy_port
                proxy_handler = urllib2.ProxyHandler({'ftp': pr})

        # In case specific proxy information was not set up in the CC section of the
        # config file, see if FTP_PROXY environment variable is set and use it.
        # TODO: Note that this "if" block may be unnecessary.
        # Python urllib2 reads environment variable ftp_proxy (which appears to be case-insensitive)
        # on its own whenever no explicit ProxyHandler is given to build_opener().
        if not proxy_handler:
            try:
                pr = os.environ['FTP_PROXY']
                proxy_handler = urllib2.ProxyHandler({'ftp': pr})
            except KeyError:
                pass

        # Parse the URL and check if proxy should be used or not:
        # we have a config parameter cc_no_proxy to specify if there are domains
        # for which we shouldn't be using a proxy.
        (host, no_proxy) = parse_url(self._proxy, self._input)
        if no_proxy:
            # An explicit handler with an empty proxy map is needed to really bypass the proxy:
            # with no ProxyHandler at all, build_opener() installs its default one, which
            # picks the proxy up from the environment again.
            proxy_handler = urllib2.ProxyHandler({})

        # Ignore the credential, even if it is provided.
        # urllib2.FTPHandler login with empty user name and password (works for anonymous login as well).
        if proxy_handler:
            opener = build_opener(proxy_handler, urllib2.FTPHandler)
        else:
            opener = build_opener(urllib2.FTPHandler)
        self._resp = opener.open(self._input)
        return self

    def _read_encoding(self):
        """
        The default read() method is replaced with this method when an encoding is specified.

        @return: The content, decoded with the reader's encoding.
        @rtype: unicode
        """
        return unicode(self._resp.read(), self.encoding)

    def read(self):
        """
        Read the content from the connection.

        @return: The content.
        @rtype: str/unicode
        """
        return self._resp.read()

    def _readline_encoding(self):
        """
        The default readline() method is replaced with this method when an encoding is specified.

        @return: The next line of the content, decoded with the reader's encoding.
        @rtype: unicode
        """
        return unicode(self._resp.readline(), self.encoding)

    def readline(self):
        """
        Read the next line of the content from the connection.

        @return: The next line of the content.
        @rtype: str/unicode
        """
        return self._resp.readline()

    def _readlines_encoding(self):
        """
        The default readlines() method is replaced with this method when an encoding is specified.

        @return: The lines of the content, each decoded with the reader's encoding.
        @rtype: list of unicode
        """
        return [unicode(l, self.encoding) for l in self._resp.readlines()]

    def readlines(self):
        """
        Read the content in lines from the connection.

        @return: The lines of the content.
        @rtype: list
        """
        return self._resp.readlines()

    def close(self):
        """
        Close the connection. Does nothing if no connection is open.
        """
        if self._resp:
            self._resp.close()
            self._resp = None
class SnapFileReader(SnapReader):
    """
    Reader class for file input.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source.
        @type input: str
        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.
        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.
        @param encoding: The encoding to expect from the source.
        @type encoding: str
        @param proxy: Proxy server information. It is passed to the base class;
                      this reader's open() does not use it.
        @type proxy: mapping of proxy config options, or None
        """
        super(SnapFileReader, self).__init__(input, username, password, encoding, proxy)

    def open(self, callback=None, cbdata=None):
        """
        Open a connection to the input source.

        @param callback: User provided callback that allows the user to create its own reader object.
        @type callback: function, or None.
        @param cbdata: Opaque value handed to the callback; ignored when there is no callback.
        @return: The object returned by the callback if one is given, otherwise this reader.
        @rtype: snaplogic.common.SnapReader
        """
        if callback:
            return super(SnapFileReader, self).open(callback=callback, cbdata=cbdata)
        opener = build_opener(urllib2.FileHandler)
        self._resp = opener.open(self._input)
        return self

    def read(self):
        """
        Read the content from the connection.

        @return: The content.
        @rtype: str
        """
        return self._resp.read()

    def _read_encoding(self):
        """
        The default read() method is replaced with this method when an
        encoding is specified.

        @return: The content, decoded with the reader's encoding.
        @rtype: unicode
        """
        return unicode(self._resp.read(), self.encoding)

    def _readline_encoding(self):
        """
        The default readline() method is replaced with this method when an
        encoding is specified.

        @return: The next line of the content, decoded with the reader's encoding.
        @rtype: unicode
        """
        return unicode(self._resp.readline(), self.encoding)

    def readline(self):
        """
        Read the next line of the content from the connection.

        @return: The next line of the content.
        @rtype: str
        """
        return self._resp.readline()

    def _readlines_encoding(self):
        """
        The default readlines() method is replaced with this method when an
        encoding is specified.

        @return: The lines of the content, each decoded with the reader's encoding.
        @rtype: list of unicode
        """
        return [unicode(l, self.encoding) for l in self._resp.readlines()]

    def readlines(self):
        """
        Read the content in lines from the connection.

        @return: The lines of the content.
        @rtype: list
        """
        return self._resp.readlines()

    def close(self):
        """
        Close the connection. Does nothing if no connection is open.
        """
        if self._resp:
            self._resp.close()
            self._resp = None
class SnapHttpReader(SnapReader):
    """
    Reader class for http input.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source.
        @type input: str
        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.
        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.
        @param encoding: The encoding to expect from the source.
        @type encoding: str
        @param proxy: Proxy server information that may be needed to read
                      resources at external URIs. open() reads the 'cc_http_proxy*' options
                      and (optionally) 'cc_no_proxy' from it.
        @type proxy: mapping of proxy config options, or None
        """
        super(SnapHttpReader, self).__init__(input, username, password, encoding, proxy)

    def open(self, callback=None, cbdata=None):
        """
        Open a reader object for HTTP access.

        @param callback: User provided callback that allows the user to create its own reader object.
        @type callback: function, or None.
        @param cbdata: Opaque value handed to the callback; ignored when there is no callback.
        @return: The object returned by the callback if one is given, otherwise this reader.
        @rtype: snaplogic.common.SnapReader
        """
        if callback:
            return super(SnapHttpReader, self).open(callback=callback, cbdata=cbdata)

        # Optional proxy handler setup for accessing external URIs
        proxy_handler = None
        proxy_auth_handler = None
        if self._proxy:
            proxy_server = self._proxy["cc_http_proxy"]
            proxy_port = self._proxy["cc_http_proxy_port"]
            if proxy_server != "":
                pr = proxy_server + ":" + proxy_port
                # NB: support for https over proxy requires patching Python 2.5.
                proxy_handler = urllib2.ProxyHandler({'http': pr})
                # NB: Authenticating proxy is untested.
                # There is also potentially Digest authentication to add later.
                proxy_auth_realm = self._proxy["cc_http_proxy_realm"]
                proxy_auth_host = self._proxy["cc_http_proxy_host"]
                proxy_auth_username = self._proxy["cc_http_proxy_username"]
                proxy_auth_password = self._proxy["cc_http_proxy_password"]
                if proxy_auth_username != "":
                    proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
                    proxy_auth_handler.add_password(proxy_auth_realm, proxy_auth_host,
                                                    proxy_auth_username, proxy_auth_password)

        # In case specific proxy information was not set up in the CC section of the
        # config file, see if HTTP_PROXY environment variable is set and use it.
        # Note: urllib2 provides default, transparent proxy handler for this (non-authenticating
        # proxy only - to use authenticating proxy, config options like cc_http_proxy_realm have
        # to be used for the codepath above) case, however we set it explicitly here for symmetry
        # and to exercise the below code path in QA system, since there's no way to automatically
        # test config setup yet due to bug 1353.
        # TODO: Note that this "if" block may be unnecessary.
        # Python urllib2 reads environment variable http_proxy (which appears to be case-insensitive)
        # on its own whenever no explicit ProxyHandler is given to build_opener().
        if not proxy_handler:
            try:
                pr = os.environ['HTTP_PROXY']
                proxy_handler = urllib2.ProxyHandler({'http': pr})
            except KeyError:
                pass

        # Parse the URL and check if proxy should be used or not:
        # we have a config parameter cc_no_proxy to specify if there are domains
        # for which we shouldn't be using a proxy.
        (host, no_proxy) = parse_url(self._proxy, self._input)
        if no_proxy:
            # An explicit handler with an empty proxy map is needed to really bypass the proxy:
            # with no ProxyHandler at all, build_opener() installs its default one, which
            # picks the proxy up from the environment again.
            proxy_handler = urllib2.ProxyHandler({})
            proxy_auth_handler = None

        # Collect the handlers in the order: proxy, proxy authentication, HTTP authentication.
        handlers = []
        if proxy_handler:
            handlers.append(proxy_handler)
            # Proxy authentication is only meaningful together with a proxy.
            if proxy_auth_handler:
                handlers.append(proxy_auth_handler)
        if self._username or self._password:
            # Authenticated connection
            # Use default realm
            mgr = HTTPPasswordMgrWithDefaultRealm()
            mgr.add_password(None, host, self._username, self._password)
            # Support for Basic HTTP Authentication
            handlers.append(HTTPBasicAuthHandler(mgr))

        opener = build_opener(*handlers)
        self._resp = opener.open(self._input)
        return self

    def _read_encoding(self):
        """
        The default read() method is replaced with this method when an
        encoding is specified.

        @return: The content, decoded with the reader's encoding.
        @rtype: unicode
        """
        return unicode(self._resp.read(), self.encoding)

    def read(self):
        """
        Read the content from the connection.

        @return: The content.
        @rtype: str/unicode
        """
        return self._resp.read()

    def _readline_encoding(self):
        """
        The default readline() method is replaced with this method when an
        encoding is specified.

        @return: The next line of the content, decoded with the reader's encoding.
        @rtype: unicode
        """
        return unicode(self._resp.readline(), self.encoding)

    def readline(self):
        """
        Read the next line of the content from the connection.

        @return: The next line of the content.
        @rtype: str
        """
        return self._resp.readline()

    def _readlines_encoding(self):
        """
        The default readlines() method is replaced with this method when an
        encoding is specified.

        @return: The lines of the content, each decoded with the reader's encoding.
        @rtype: list of unicode
        """
        return [unicode(l, self.encoding) for l in self._resp.readlines()]

    def readlines(self):
        """
        Read the content in lines from the connection.

        @return: The lines of the content.
        @rtype: list
        """
        return self._resp.readlines()

    def close(self):
        """
        Close the connection. Does nothing if no connection is open.
        """
        if self._resp:
            self._resp.close()
            self._resp = None
|