# $SnapHashLicense:
#
# SnapLogic - Open source data services
#
# Copyright (C) 2008-2009, SnapLogic, Inc. All rights reserved.
#
# See http://www.snaplogic.org for more information about
# the SnapLogic project.
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
#
# "SnapLogic" is a trademark of SnapLogic, Inc.
#
#
# $
# $Id: http_request.py 6314 2009-02-11 01:07:59Z grisha $
"""
The HttpRequest class.
We do not want to expose too much of WSGI specifics in the internals of
our system. Therefore, details of the HTTP request - which have been
presented to us in a WSGI specific manner - are translated into a more
neutral and convenient form.
For this purpose, we introduce the HttpRequest class. It will contain
specific members for the various interesting HTTP headers. All strings
will be properly decoded, and parameters will have been parsed.
An instance of this object can then be passed to the various API functions
of the other modules in the system, allowing them access to request data,
without spreading exposure to WSGI any further.
Why do we want to limit exposure to WSGI? Because WSGI is our choice now,
but it may not remain our choice in the future...
The object can also be used to gain access to the input and output streams
of the request, to set response codes, etc.
"""
from urllib import unquote,unquote_plus
import httplib
from snaplogic import rp
from snaplogic.common.snap_exceptions import SnapException,SnapNativeError,SnapNotAllowedError,SnapPluginError
from snaplogic.common import headers
# Mapping of numeric HTTP status code -> reason phrase. We simply reuse the
# table that httplib already maintains instead of spelling it out again.
_http_status_codes = httplib.responses

# Lower-case fragments that are searched for (as substrings) in the
# lower-cased User-Agent string. A match means the client is treated as a
# human behind a browser (or a search engine) and gets nicely formatted,
# human readable output.
_human_readable_agents = [
    'mozilla',
    'msie',
    'opera',
    'googlebot',
    'slurp',
    'msnbot',
]
class _WriteStream(object):
    """
    Minimal 'file like' adapter around the WSGI output callable.

    WSGI hands us a file object for reading the request body, but only a
    plain callable for writing the response body. To keep the rest of the
    system free of that asymmetry (and of WSGI specifics in general), the
    callable is wrapped in this object so that users of HttpRequest can
    treat both input and output as objects.

    Only 'write' is provided for now; further 'file like' methods may be
    added later.

    """

    def __init__(self, write_callable):
        """
        Remember the WSGI write callable.

        @param write_callable:  WSGI callable for the writing of output.
        @type  write_callable:  callable

        """
        self._write_callable = write_callable

    def write(self, data):
        """
        Pass 'data' on to the wrapped WSGI write callable.

        The result of the callable is discarded; this method always
        returns None.

        @param data:    The chunk of output to be written.

        """
        emit = self._write_callable
        emit(data)
class HttpRequest(object):
    """
    Convenient and generic HTTP request representation.

    Access to HTTP request headers
    ------------------------------
    A selected set of HTTP headers is exposed via this object. All strings
    are decoded, parameters are parsed.

    The following request properties are exposed as attributes of the
    created object:

        method
        path
        raw_uri
        content_length
        client_address
        params
        raw_params
        server_name
        server_port
        script_name
        http_accept
        http_content_type
        http_user_agent
        http_accept_encoding
        http_cache_control
        http_host
        snap_stream_continue
        snapi_headers
        invoker

    If an optional header line or request attribute is not set or defined,
    the value will be None, with these exceptions: 'params' is always a
    dictionary (possibly empty), 'raw_params' is always a string (possibly
    empty), 'http_accept' defaults to '*/*' and 'http_content_type' defaults
    to 'application/json'.

    Parameters are translated into an already decoded and parsed dictionary.
    Note that it is possible to define parameters in a URL, which are NOT of
    the form 'foo=bar'. So, for example, it is possible to write:
    ?foo=123&bar&xyz... In that case, 'bar' doesn't have a value, it simply
    'is', and the value stored for 'bar' in the params dictionary is None.
    All other values are always stored as strings. Only the first '=' of a
    parameter separates name and value, so 'a=b=c' yields the value 'b=c'.

    Note: For performance optimization it might be useful to perform a lazy
    decoding and parsing. So, only if something requiring those steps is
    actually accessed will the operation be performed. However, that's an
    optimization we can leave for later.

    In addition, 'username', 'groups' and 'permissions' are exposed. They
    are copied from the 'snap_username', 'snap_groups' and 'snap_permissions'
    entries of the WSGI environment, and are None if those entries are
    missing (for example, in case of an anonymous request).

    Access to input and output streams
    ----------------------------------
    The '_input' attribute of the object gives access to the input file
    object of the request. Thus, the usual read(), etc. methods are defined
    on that object. Note that the user of the '_input' file object should
    NOT call close().

    The request object contains the 'content_length' attribute. This is set
    to the value that was indicated in the HTTP request header of the same
    name. If that header wasn't provided (or was empty), then the value of
    this attribute is None.

    The '_output' attribute of the object is used to write information to
    the output stream. It currently only supports a write() method.

    Note that access to the output stream is ONLY possible AFTER the response
    headers have been set with the send_response_headers() method. Until that
    has happened, the '_output' object doesn't exist at all, and its value is
    None.

    For convenience, we provide a function here to make and initialize the
    RP for the input stream, taking care of all the right content
    negotiation. This is the make_input_rp() function. The output RP is set
    up by send_response_headers().

    The readers and writers of those RPs are then accessible via the
    'input' and 'output' attributes (as compared to '_input' and '_output',
    which represent the raw streams). On the RP objects, you have a 'next()'
    method defined on 'input' and a 'write()' method on 'output'. Since those
    are RPs, the next() method returns an actual Python object, and the
    write() method takes an actual Python object as input.

    The actual RPs are available as '_input_rp' and '_output_rp',
    respectively.

    Type of client
    --------------
    The object attempts to decide whether the request was made by a human via
    a web browser. For that purpose, it examines the user-agent string. The
    object exposes a 'human_request' attribute, which is either True or False
    depending on the outcome of this analysis.

    """

    # We also would like to have a constant that is defined for the various
    # return codes. httplib provides that as well, but not in an easily
    # usable form, just as module attributes. Yuck! So, here we have written
    # them out again.
    CONTINUE                        = 100
    SWITCHING_PROTOCOLS             = 101
    PROCESSING                      = 102
    OK                              = 200
    CREATED                         = 201
    ACCEPTED                        = 202
    NON_AUTHORITATIVE_INFORMATION   = 203
    NO_CONTENT                      = 204
    RESET_CONTENT                   = 205
    PARTIAL_CONTENT                 = 206
    MULTI_STATUS                    = 207
    MULTIPLE_CHOICES                = 300
    MOVED_PERMANENTLY               = 301
    FOUND                           = 302
    SEE_OTHER                       = 303
    NOT_MODIFIED                    = 304
    USE_PROXY                       = 305
    TEMPORARY_REDIRECT              = 307
    BAD_REQUEST                     = 400
    UNAUTHORIZED                    = 401
    PAYMENT_REQUIRED                = 402
    FORBIDDEN                       = 403
    NOT_FOUND                       = 404
    METHOD_NOT_ALLOWED              = 405
    NOT_ACCEPTABLE                  = 406
    PROXY_AUTHENTICATION_REQUIRED   = 407
    REQUEST_TIMEOUT                 = 408
    CONFLICT                        = 409
    GONE                            = 410
    LENGTH_REQUIRED                 = 411
    PRECONDITION_FAILED             = 412
    REQUEST_ENTITY_TOO_LARGE        = 413
    REQUEST_URI_TOO_LONG            = 414
    UNSUPPORTED_MEDIA_TYPE          = 415
    REQUESTED_RANGE_NOT_SATISFIABLE = 416
    EXPECTATION_FAILED              = 417
    UNPROCESSABLE_ENTITY            = 422
    LOCKED                          = 423
    FAILED_DEPENDENCY               = 424
    UPGRADE_REQUIRED                = 426
    INTERNAL_SERVER_ERROR           = 500
    NOT_IMPLEMENTED                 = 501
    BAD_GATEWAY                     = 502
    SERVICE_UNAVAILABLE             = 503
    GATEWAY_TIMEOUT                 = 504
    HTTP_VERSION_NOT_SUPPORTED      = 505
    INSUFFICIENT_STORAGE            = 507
    NOT_EXTENDED                    = 510

    # Reason phrases for the status codes defined above which the Python 2
    # httplib.responses table does not list. Without this fallback, sending
    # one of those codes would fail with a KeyError while formatting the
    # status line.
    _EXTRA_STATUS_TEXT = {
        102: 'Processing',
        207: 'Multi-Status',
        422: 'Unprocessable Entity',
        423: 'Locked',
        424: 'Failed Dependency',
        426: 'Upgrade Required',
        507: 'Insufficient Storage',
        510: 'Not Extended',
    }

    # Header used for multi-POST SnapStream after WSGI translation
    SNAPSTREAM_CONTINUED = 'HTTP_' + headers.SNAPSTREAM_CONTINUED.upper().replace('-', '_')

    # This class variable here is used to tell the HttpRequest instances
    # whether they will be allowed to consider a request as human-readable
    # (someone with a browser making the request). Setting this to False
    # makes sense for CCs, which should never be contacted by browsers
    # directly, and which don't have all the necessary config information
    # to display human readable output anyway.
    allow_human_readable = True

    def __init__(self, env, start_response):
        """
        Create an instance of HttpRequest based on the WSGI request object.

        Takes the WSGI request object and makes it available via the
        HttpRequest object, performing any decoding along the way as
        necessary.

        This object also provides access to other aspects of the WSGI
        request, such as: Reading data from the input stream, setting
        response headers and writing output.

        @param env:             The WSGI request object.
        @type  env:             dictionary

        @param start_response:  A callable provided by WSGI, which can be
                                used to write the response.
        @type  start_response:  callable

        @raise KeyError:        If one of 'REQUEST_METHOD', 'SERVER_NAME',
                                'SERVER_PORT' or 'wsgi.input' is missing
                                from the environment.
        @raise ValueError:      If 'SERVER_PORT' or a non-empty
                                'CONTENT_LENGTH' is not numeric.

        """
        # WSGI allows the server to omit these variables when they are
        # empty, so a missing entry is treated like the empty string.
        path_info    = env.get('PATH_INFO', '')
        query_string = env.get('QUERY_STRING', '')

        if query_string:
            self.raw_uri = "%s?%s" % (path_info, query_string)
        else:
            self.raw_uri = path_info

        def _unquoted(name, default=None):
            # Unquote the named environment entry; the default is returned
            # as-is (it is NOT unquoted) when the entry is absent.
            if name in env:
                return unquote(env[name])
            return default

        self.method = env['REQUEST_METHOD']

        # The unquoted path may have been added by our authentication module
        # (if we are running in the main server). In that case, we can save
        # ourselves unquoting the same string a second time. But if we are on
        # a CC here, then this hasn't happened, and we just do it ourselves.
        if 'SNAP_UNQ_PATH' in env:
            self.path = env['SNAP_UNQ_PATH']
        else:
            self.path = unquote(path_info)

        # CONTENT_LENGTH may be absent or empty (e.g. for GET requests); the
        # documented value for that case is None.
        self.content_length       = self._parse_content_length(env.get('CONTENT_LENGTH'))
        self.client_address       = env.get('REMOTE_ADDR')
        self.server_name          = env['SERVER_NAME']
        self.server_port          = int(env['SERVER_PORT'])
        self.script_name          = _unquoted('SCRIPT_NAME')
        self.http_accept          = _unquoted('HTTP_ACCEPT', '*/*')
        self.http_content_type    = _unquoted('CONTENT_TYPE', 'application/json')
        self.http_user_agent      = _unquoted('HTTP_USER_AGENT')
        self.http_accept_encoding = _unquoted('HTTP_ACCEPT_ENCODING')
        if 'HTTP_CACHE_CONTROL' in env:
            self.http_cache_control = env['HTTP_CACHE_CONTROL'].lower()
        else:
            self.http_cache_control = None
        self.http_host            = env.get('HTTP_HOST')
        self.snap_stream_continue = _unquoted(self.SNAPSTREAM_CONTINUED)
        self.snapi_headers        = self._retrieve_snapi_headers(env)
        self.username             = env.get('snap_username')
        self.groups               = env.get('snap_groups')
        self.permissions          = env.get('snap_permissions')
        self.invoker              = env.get(headers.WSGI_INVOKER_HEADER)

        # We examine the user agent to decide whether this is a human making
        # the request (via a browser). For search engines, we also want to
        # return the human readable form, because it has more unique elements
        # one can search for. Also, search engines are used by humans, and
        # thus they can benefit from nicely formatted pages in previews or
        # caches.
        self.human_request = False
        if HttpRequest.allow_human_readable and self.http_user_agent:
            agent_string = self.http_user_agent.lower()
            self.human_request = any(fragment in agent_string
                                     for fragment in _human_readable_agents)

        # We will provide an RP for input and output, but here we store
        # the raw input and output sockets/methods/objects.
        self._input          = env['wsgi.input']
        self._output         = None
        self._start_response = start_response
        self._input_rp       = None
        self._output_rp      = None

        # These are the references to the RP readers and writers for the
        # input and output.
        self.input  = None
        self.output = None

        # Both attributes are always defined, even without a query string.
        self.raw_params = query_string
        self.params     = self._parse_query_string(query_string)

    @staticmethod
    def _parse_content_length(raw):
        """
        Convert the raw CONTENT_LENGTH environment value into an integer.

        @param raw:     The value found in the environment, or None if the
                        variable was not present at all.
        @type  raw:     string or None

        @return:        The content length, or None if the value was missing
                        or consisted only of whitespace.
        @rtype:         integer or None

        @raise ValueError:  If a non-empty value is not numeric.

        """
        if raw is None:
            return None
        text = str(raw).strip()
        if not text:
            return None
        return int(text)

    @staticmethod
    def _parse_query_string(qs):
        """
        Parse a raw query string into a dictionary of decoded parameters.

        Special characters may have been encoded in the parameter names or
        values, which is why both are 'unquoted' here. A parameter without
        any '=' gets the value None. Only the first '=' separates name and
        value, so values containing '=' are preserved. Empty segments (as in
        'a=1&&b=2' or a trailing '&') are ignored.

        @param qs:      The raw (still encoded) query string.
        @type  qs:      string

        @return:        Dictionary mapping parameter name to value.
        @rtype:         dict

        """
        parsed = {}
        if not qs:
            return parsed
        for segment in qs.split("&"):
            if not segment:
                continue
            name, separator, value = segment.partition("=")
            if separator:
                parsed[unquote_plus(name)] = unquote_plus(value)
            else:
                parsed[unquote_plus(name)] = None
        return parsed

    def _retrieve_snapi_headers(self, header):
        """
        Extract the Snapi-specific entries from a WSGI environment.

        The prefix that is searched for is 'HTTP_' followed by
        L{SNAPI_HEADER_PREFIX}, with dashes replaced by underscores and
        converted to upper case (which is how WSGI presents HTTP request
        headers). Every key starting with that prefix is copied to the
        result with the prefix stripped off; all other keys are ignored.

        For instance, assuming the translated prefix is 'HTTP_X_SNAPI_'
        (the actual value is defined in the headers module), then
        {'HTTP_X_SNAPI_FOO' : 'bar', 'CONTENT_LENGTH' : '37'} results in
        {'FOO' : 'bar'}.

        @param header:  WSGI environment containing the request headers.
        @type  header:  dict

        @return:        Dictionary of snapi-specific keys and values.
        @rtype:         dict

        """
        snapi_prefix = "HTTP_" + headers.SNAPI_HEADER_PREFIX.replace('-', '_').upper()
        cut = len(snapi_prefix)
        return dict((key[cut:], value)
                    for key, value in header.items()
                    if key.startswith(snapi_prefix))

    def make_input_rp(self):
        """
        Create the RP for the input stream.

        The input content type (only applies to POST and PUT requests)
        is indicated via the content-type header. If we cannot match
        the indicated content type against an available RP, then the
        'input' attribute will be left at None. Users of the http_req
        object may use '_input' to read the raw input stream.

        """
        self._input_rp = rp.get_rp(self.http_content_type)
        if self._input_rp:
            self.input = self._input_rp.Reader(self._input)

    def get_input_content_type(self):
        """
        Return the agreed upon content type for the input.

        A 'content-type' HTTP header can contain a large number of suggested
        content types, only one of which will be chosen by get_rp(). The
        string that represents the chosen one (the CONTENT_TYPE attribute of
        the input RP) is returned here.

        This is useful when we want to create the right HTTP return
        headers, for example.

        @return:        The chosen content type, or None if the input RP
                        hasn't been chosen.
        @rtype:         string or None

        """
        if self._input_rp:
            return self._input_rp.CONTENT_TYPE
        return None

    def get_output_content_type(self):
        """
        Return the agreed upon content type for the output.

        An 'accept' HTTP header may contain a large number of suggested
        content types, only one of which will be chosen by get_rp(). The
        string that represents the chosen one (the CONTENT_TYPE attribute of
        the output RP) is returned here.

        This is useful when we want to create the right HTTP return
        headers, for example.

        @return:        The chosen content type, or None if the output RP
                        hasn't been chosen.
        @rtype:         string or None

        """
        if self._output_rp:
            return self._output_rp.CONTENT_TYPE
        return None

    def response_already_sent(self):
        """
        Return True if the response headers have been sent already.

        The HTTP response code and headers can only be sent once. This
        method provides an easy test for users of http_req to see if
        this has happened already.

        @return:        Flag indicating whether the response has been
                        sent already.
        @rtype:         bool

        """
        return bool(self._output)

    def send_response_headers(self, code, headers=None, init_output=True, options=None):
        """
        Send response headers and create output stream object.

        This is called to set any HTTP response headers and also set the HTTP
        response code. The code is simply a number, indicating the well-known
        HTTP status codes, such as 200, 404, etc.

        Headers are defined as a list of tuples, such as:

            [('Content-type', 'text/plain'), ]

        Always one tuple per header. Neither the passed in header list nor
        the options dictionary are modified; this method works on copies.

        @param code:        HTTP status code.
        @type  code:        integer

        @param headers:     List of tuples, specifying the HTTP response
                            headers and their values.
        @type  headers:     list

        @param init_output: If this flag is specified then we attempt to
                            create the output RP after sending the response
                            headers.

                            The acceptable output content type is indicated
                            via the Accept header (for requests classified as
                            human, 'text/html' is chosen instead). If no
                            matching RP can be found, a 415 response is sent
                            to the client and an exception is raised.

                            If the flag is False, users of the http_req
                            object may use '_output' to write to the raw
                            output stream.
        @type  init_output: bool

        @param options:     A dictionary with options for any RPs that are
                            initialized.
        @type  options:     dict

        @raise SnapNotAllowedError: If the response headers were sent before.
        @raise SnapPluginError:     If init_output was requested, but no RP
                                    for an acceptable output content type
                                    could be set up.

        """
        # NOTE: Inside this method the 'headers' parameter shadows the
        # module of the same name, so that module is not used in here.
        if self._output:
            raise SnapNotAllowedError("The HTTP response headers can only be sent once.")

        # Private copy, so that the caller's list is never modified.
        headers = list(headers) if headers else []
        negotiation_failed = False

        if init_output:
            try:
                # Caller wants us to set up the output RP...
                if self.human_request:
                    # IE for some reason doesn't indicate text/html as an
                    # acceptable content type, only '*/*' and other junk. So,
                    # if the human flag is set, we just have to manually
                    # choose the appropriate content type. Otherwise, we
                    # would default to 'application/x-snap-asn1' and IE
                    # wouldn't like that.
                    self._output_rp = rp.get_rp("text/html")

                    # Add option for RP indicating the request URI. If the
                    # client did not send a Host header we fall back to the
                    # server name and port, rather than failing.
                    host = self.http_host
                    if not host:
                        host = self.server_name
                        if self.server_port != 80:
                            host = "%s:%d" % (host, self.server_port)
                    options = dict(options) if options else {}
                    options["request_uri"] = "http://" + host + "/"
                else:
                    self._output_rp = rp.get_rp(self.http_accept)

                if not self._output_rp:
                    raise ValueError()

                # ... and when this is successful, we can also specify the
                # right content type for the output in the headers, while
                # we are at it.
                headers.append(('content-type', self.get_output_content_type()))
            except Exception:
                # ... but no matching RP could be found, which means we have
                # to signal an error to the client. (Deliberately not a bare
                # 'except', so that KeyboardInterrupt and SystemExit still
                # propagate.)
                negotiation_failed = True
                self._output_rp    = None
                code               = HttpRequest.UNSUPPORTED_MEDIA_TYPE
                headers            = []

        reason = _http_status_codes.get(code)
        if reason is None:
            reason = self._EXTRA_STATUS_TEXT.get(code, "Unknown Status")
        self._output = _WriteStream(self._start_response("%d %s" % (code, reason), headers))

        if init_output:
            if negotiation_failed:
                # If we were not able to create the proper RP, we need to
                # signal that to the client with an error, and to the caller
                # with an exception. The output stream needs to collapse
                # again. An explicit flag is used (rather than comparing the
                # code with 415), so that a caller deliberately sending a 415
                # is not mistaken for a failed negotiation.
                self._output = None
                raise SnapPluginError("Cannot find media type for acceptable output: '%s'" % (self.http_accept))
            self.output = self._output_rp.Writer(self._output, None, self.human_request, options)
            self.output.initialize()

    def __repr__(self):
        """
        Returns the HTTP request object's string representation.

        One 'NAME: value' line is produced per attribute. Note that this is
        not well optimized. Shouldn't be a problem, since this is mostly only
        used for debugging anyway.

        @return:        String representation of the HttpRequest object.
        @rtype:         string

        """
        # '%s' is used throughout, since several attributes (for example
        # content_length) may legitimately be None.
        fields = [
            ("METHOD",               self.method),
            ("PATH",                 self.path),
            ("PARAMS",               self.params),
            ("RAW_URI",              self.raw_uri),
            ("CONTENT_LENGTH",       self.content_length),
            ("CLIENT_ADDRESS",       self.client_address),
            ("SERVER_NAME",          self.server_name),
            ("SERVER_PORT",          self.server_port),
            ("SCRIPT_NAME",          self.script_name),
            ("HTTP_ACCEPT",          self.http_accept),
            ("HTTP_CONTENT_TYPE",    self.http_content_type),
            ("HTTP_ACCEPT_ENCODING", self.http_accept_encoding),
            ("HTTP_CACHE_CONTROL",   self.http_cache_control),
            ("HTTP_HOST",            self.http_host),
            ("USER_AGENT",           self.http_user_agent),
            ("SNAP_STREAM_CONTINUE", self.snap_stream_continue),
            ("SNAPI_HEADERS",        self.snapi_headers),
            ("USERNAME",             self.username),
            ("GROUPS",               self.groups),
            ("PERMISSIONS",          self.permissions),
            ("HUMAN_REQUEST",        self.human_request),
        ]
        return "\n".join(["%s: %s" % (label, value) for label, value in fields])
|