"""URLParser
URL parsing is done through objects which are subclasses of the `URLParser`
class. `Application` delegates most of the URL parsing to these objects.
Application has a single "root" URL parser, which is used to parse all URLs.
This parser then can pass the request on to other parsers, usually taking off
parts of the URL with each step.
This root parser is generally `ContextParser`, which is instantiated and set
up by `Application` (accessible through `Application.rootURLParser`).
"""
import re, os, sys
from HTTPExceptions import HTTPNotFound,HTTPMovedPermanently
from MiscUtils.ParamFactory import ParamFactory
from WebUtils.Funcs import urlDecode
# Set to a true value to make `_FileParser.parse` print what it is parsing.
debug = 0

# The `warnings` module only exists from Python 2.1 on; older interpreters
# get a minimal stand-in that simply prints the message.
try:
    from warnings import warn
except ImportError:
    def warn(message, **args):
        """Print `message`; keyword arguments are accepted and ignored."""
        print(message)
# Matches every character that is not legal in a module name -- used when
# turning an entire path into a module name (see `_FileParser.initModule`).
# Digits are kept: the generated name always carries a 'WebKit_Cache_'
# prefix, so a digit can never end up first, and dropping digits would map
# directories such as ``site/v1`` and ``site/v2`` to the same module name.
_moduleNameRE = re.compile('[^a-zA-Z0-9_]')

# The Application instance; set by `initApp`.
_globalApplication = None


def application():
    """Returns the global Application (None until `initApp` was called)."""
    return _globalApplication
class URLParser:
    """Base class of all URL parsers.

    The class itself does very little; it mainly defines the protocol
    that the concrete parsers follow.

    A subclass has to provide a `parse` method and is free to define an
    `__init__` taking whatever arguments configure the parser (a starting
    path, for instance).

    `parse` receives two arguments: the transaction and the part of the
    URL that has not been consumed yet. A parser typically strips one or
    more leading pieces off that URL, possibly modifies the transaction,
    and hands the shortened URL on to another parser. The method has to
    return a servlet (never None).

    When no servlet can be found, or another (somewhat) expected problem
    occurs, an exception should be raised -- HTTPNotFound being the most
    common one.

    """

    def __init__(self):
        """Nothing to set up in the base class."""

    def findServletForTransaction(self, trans):
        """Returns a servlet for the transaction.

        Top-level entry point: the URL path of the transaction's request
        is handed to `parse`, which does all further work.

        """
        urlPath = trans.request().urlPath()
        return self.parse(trans, urlPath)
class ContextParser(URLParser):
    """Find the context of a request.

    ContextParser uses the ``Application.config`` context settings to find
    the context of the request. It then passes the request to a FileParser
    rooted in the context path.

    The context is the longest leading part of the URL that names a known
    context, or if no context matches then it is the ``default`` context
    (and the entire URL is passed to the default context's FileParser).

    There is generally only one ContextParser, which can be found as
    ``application.rootURLParser()``.

    """

    ## Init ##

    def __init__(self, app):
        """Create ContextParser.

        ContextParser is usually created by Application, which
        passes all requests to it.

        In __init__ we take the ``Contexts`` setting from
        Application.config and parse it slightly.

        Raises KeyError if no default context is configured and none
        can be determined.

        """
        URLParser.__init__(self)
        # Need to set this here because we need for initialization, during
        # which AppServer.globalAppServer.application() doesn't yet exist:
        self._app = app
        self._imp = app._imp
        # self._contexts will be a dictionary of context names and context
        # directories. It is filled by `addContext`.
        self._contexts = {}
        # add all contexts except the default, which we save until the end
        contexts = app.setting('Contexts')
        defaultContext = ''
        for name, dir in contexts.items():
            dir = os.path.normpath(dir) # for Windows
            if name == 'default':
                defaultContext = dir
            else:
                self.addContext(self._normalizedName(name), dir)
        if not defaultContext:
            # If no default context has been specified, and there is a unique
            # context not built into Webware, use it as the default context.
            for name in contexts.keys():
                if name.endswith('/Docs') or name in (
                        'Admin', 'Docs', 'Examples', 'MKBrowser', 'Testing'):
                    continue
                if defaultContext:
                    # a second candidate: the choice is not unique
                    defaultContext = None
                    break
                defaultContext = name
            if defaultContext:
                # Contexts are registered under their normalized names,
                # so the lookup below has to use the normalized name, too.
                defaultContext = self._normalizedName(defaultContext)
            else:
                # otherwise, try using the following contexts if available
                for defaultContext in ('Default', 'Examples', 'Docs'):
                    if contexts.has_key(defaultContext):
                        break
                else: # if not available, refuse the temptation to guess
                    raise KeyError("No default context has been specified.")
        if self._contexts.has_key(defaultContext):
            # the default is the name of an existing context
            self._defaultContext = defaultContext
        else:
            for name, dir in self._contexts.items():
                if defaultContext == dir:
                    # the default is the directory of an existing context
                    self._defaultContext = name
                    break
            else:
                # `addContext` sets self._defaultContext itself, either to
                # 'default' or to the context the directory aliases to.
                # It must not be overwritten here, because in the alias
                # case no context called 'default' gets registered.
                self.addContext('default', defaultContext)

    def _normalizedName(self, name):
        """Return the context name without empty path segments."""
        return '/'.join([part for part in name.split('/') if part])

    ## Context handling ##

    def resolveDefaultContext(self, name, dest):
        """Find default context.

        Figure out if the default context refers to an existing context,
        the same directory as an existing context, or a unique directory.

        `dest` is the configured value of the default context; `name` is
        not used. Returns the name of the context that the default
        context refers to, or 'default' if the default context is unique.

        """
        contexts = self._contexts
        contextDirs = {}
        # map the absolute paths of the existing contexts to their names
        for contextName, path in contexts.items():
            if contextName != 'default':
                contextDirs[self.absContextPath(path)] = contextName
        if contexts.has_key(dest):
            # The default context refers to another context,
            # not a unique context. Return the name of that context.
            return dest
        destPath = self.absContextPath(dest)
        if contextDirs.has_key(destPath):
            # The default context has the same directory
            # as another context, so it's still not unique
            return contextDirs[destPath]
        # The default context has no other name
        return 'default'

    def addContext(self, name, dir):
        """Add a context to the system.

        The context will be imported as a package, going by `name`,
        from the given directory. The directory doesn't have to match
        the context name.

        If the package cannot be imported, an error is printed and the
        context is not registered; no exception is raised for that.

        """
        if name == 'default':
            dest = self.resolveDefaultContext(name, dir)
            self._defaultContext = dest
            if dest != 'default':
                # in this case default refers to an existing context, so
                # there's not much to do
                print('Default context aliases to: %s' % (dest))
                return
        error = None
        try:
            if sys.modules.has_key(name):
                mod = sys.modules[name]
            else:
                res = self._findContextPackage(name, dir)
                mod = self._imp.load_module(name, *res)
        except (ImportError, TypeError):
            # TypeError can be raised by imp.load_module()
            # when the context path does not exist
            error = sys.exc_info()[1]
        if error is not None:
            print('Error loading context: %s: %s: dir=%s'
                % (name, error, dir))
            return
        if hasattr(mod, 'contextInitialize'):
            # @@ gat 2003-07-23: switched back to old method
            # of passing application as first parameter
            # to contextInitialize for backward compatibility
            result = mod.contextInitialize(application(),
                os.path.normpath(os.path.join(os.getcwd(), dir)))
            # @@: funny hack...?
            if result is not None and result.has_key('ContentLocation'):
                dir = result['ContentLocation']
        print('Loading context: %s at %s' % (name, dir))
        self._contexts[name] = dir

    def _findContextPackage(self, name, dir):
        """Locate the package in `dir`, creating __init__.py if necessary.

        Returns the result of ``find_module`` for the package.
        Raises ImportError if the package cannot be found.

        """
        localDir, packageName = os.path.split(dir)
        message = ''
        try:
            res = self._imp.find_module(packageName, [localDir])
        except ImportError:
            res = None
            message = str(sys.exc_info()[1])
        if res:
            return res
        # Maybe this happened because it had been forgotten
        # to add the __init__.py file. So we try to create one:
        if os.path.exists(dir):
            f = os.path.join(dir, '__init__.py')
            if not os.path.exists(f):
                print('Creating __init__.py file for context: %s' % (name,))
                try:
                    initFile = open(f, 'w')
                    try:
                        # The file is opened in text mode, which already
                        # translates '\n' to the platform line ending.
                        initFile.write('# Auto-generated by WebKit\n')
                    finally:
                        initFile.close()
                except Exception:
                    print('Error: __init__.py file could not be created.')
                else:
                    res = self._imp.find_module(packageName, [localDir])
                    if res:
                        return res
        # Raise a fresh exception instead of a bare ``raise``: after the
        # nested handler above, Python 2 would re-raise the most recently
        # handled exception, which may be the failed file creation.
        raise ImportError(message or 'Could not import package')

    def absContextPath(self, path):
        """Get absolute context path.

        Resolves relative paths, which are assumed to be relative to the
        Application's serverSidePath (the working directory).

        """
        if os.path.isabs(path):
            return path
        else:
            return self._app.serverSidePath(path)

    ## Parsing ##

    def parse(self, trans, requestPath):
        """Parse request.

        Get the context name, and dispatch to a FileParser rooted
        in the context's path.

        The context name and file path are stored in the request (accessible
        through `Request.serverSidePath` and `Request.contextName`).

        Raises HTTPMovedPermanently if `requestPath` is empty.

        """
        # This is a hack... should probably go in the Transaction class:
        trans._fileParserInitSeen = {}
        # If there is no path, redirect to the root path:
        req = trans.request()
        if not requestPath:
            p = req.servletPath() + '/'
            q = req.queryString()
            if q:
                p += "?" + q
            raise HTTPMovedPermanently(location=p)
        # Determine the context name:
        if req._absolutepath:
            contextName = self._defaultContext
        else:
            context = [part for part in requestPath.split('/') if part]
            if requestPath.endswith('/'):
                context.append('')
            # Try the whole path as context name first, then move one
            # segment after the other from the end of `context` to the
            # front of `parts` until a known context name remains.
            parts = []
            while context:
                contextName = '/'.join(context)
                if self._contexts.has_key(contextName):
                    break
                parts.insert(0, context.pop())
            if context:
                # the segments after the context name form the new path
                if parts:
                    parts.insert(0, '')
                    requestPath = '/'.join(parts)
                else:
                    requestPath = ''
            else:
                # no context matched: the default context gets it all
                contextName = self._defaultContext
        context = self._contexts[contextName]
        req._serverSideContextPath = context
        req._contextName = contextName
        fpp = FileParser(context)
        return fpp.parse(trans, requestPath)
class _FileParser(URLParser):
    """Parse requests to the filesystem.

    FileParser dispatches to servlets in the filesystem, as well as providing
    hooks to override the FileParser.

    FileParser objects are threadsafe. A factory function is used to cache
    FileParser instances, so for any one path only a single FileParser instance
    will exist. The `_FileParser` class is the real class, and `FileParser` is
    a factory that either returns an existent _FileParser object, or creates a
    new one if none exists.

    FileParser uses several settings from ``Application.config``, which are
    persistent over the life of the application. These are set up in the
    function `initParser` (called by `initApp`), as class variables. They
    cannot be set when the module is loaded, because the Application is not
    yet set up, so `initApp` is called in `Application.__init__`.

    """

    ## Init ##

    def __init__(self, path):
        """Create a FileParser.

        Each parsed directory has a FileParser instance associated with it
        (``self._path``).

        """
        URLParser.__init__(self)
        self._path = path
        self._initModule = None

    ## Parsing ##

    def parse(self, trans, requestPath):
        """Return the servlet.

        __init__ files will be used for various hooks
        (see `parseInit` for more).

        If the next part of the URL is a directory, it calls
        ``FileParser(dirPath).parse(trans, restOfPath)`` where ``restOfPath``
        is `requestPath` with the first section of the path removed (the part
        of the path that this FileParser just handled).

        This uses `filenamesForBaseName` to find files in its directory.
        That function has several settings to define what files are ignored,
        hidden, etc. See its documentation for more information.

        Raises HTTPNotFound if the page is ambiguous, or if extra path
        info is left over while the ``ExtraPathInfo`` setting is off.

        """
        if debug:
            print("FP(%r) parses %r" % (self._path, requestPath))
        req = trans.request()
        if req._absolutepath:
            name = req._fsPath
            restPart = req._extraURLPath
        else:
            # First decode the URL, since we are dealing with filenames here:
            requestPath = urlDecode(requestPath)
            result = self.parseInit(trans, requestPath)
            if result is not None:
                return result
            assert not requestPath or requestPath.startswith('/'), \
                "Not what I expected: %s" % repr(requestPath)
            if not requestPath or requestPath == '/':
                return self.parseIndex(trans, requestPath)
            parts = requestPath[1:].split('/', 1)
            nextPart = parts[0]
            restPart = len(parts) > 1 and '/' + parts[1] or ''
            baseName = os.path.join(self._path, nextPart)
            if restPart and not self._extraPathInfo:
                # Shortcut: there is more of the URL to come and extra
                # path info is not allowed, so the next part can only be
                # a directory. Names starting with a dot are never served
                # by `filenamesForBaseName`; the shortcut must not serve
                # them either, otherwise '..' segments would allow to
                # walk out of the context directory.
                if nextPart.startswith('.'):
                    names = []
                else:
                    names = [baseName]
            else:
                names = self.filenamesForBaseName(baseName)
            if len(names) > 1:
                warn("More than one file matches %s in %s: %s"
                    % (requestPath, self._path, names))
                raise HTTPNotFound("Page is ambiguous")
            elif not names:
                return self.parseIndex(trans, requestPath)
            name = names[0]
            if os.path.isdir(name):
                # directories are dispatched to FileParsers
                # rooted in that directory
                fpp = FileParser(name)
                return fpp.parse(trans, restPart)
        req._extraURLPath = restPart
        if not self._extraPathInfo and req._extraURLPath:
            # NOTE(review): the message contains request data -- confirm
            # that HTTPNotFound escapes it when rendering the error page.
            raise HTTPNotFound("Invalid extra path info: %s"
                % req._extraURLPath)
        req._serverSidePath = name
        return ServletFactoryManager.servletForFile(trans, name)

    def filenamesForBaseName(self, baseName):
        """Find all files for a given base name.

        Given a path, like ``/a/b/c``, searches for files in ``/a/b``
        that start with ``c``. The final name may include an extension,
        which is less ambiguous; though if you ask for file.html,
        and file.html.py exists, that file will be returned.

        If more than one file is returned for the basename, you'll
        get a 404. If the directory cannot be listed, no file matches.

        Some settings are used to control this. All settings are
        in ``Application.config``:

        FilesToHide:
            These files will be ignored, and even given a full
            extension will not be used. Takes a glob.
        FilesToServe:
            If set, *only* files matching these globs will be
            served, all other files will be ignored.
        ExtensionsToIgnore:
            Files with these extensions will be ignored, but if a
            complete filename (with extension) is given the file
            *will* be served (unlike FilesToHide). Extensions are
            in the form ``".py"``
        ExtensionsToServe:
            If set, only files with these extensions will be
            served. Like FilesToServe, only doesn't use globs.
        UseCascadingExtensions:
            If true, then extensions will be prioritized. So if
            extension ``.tmpl`` shows up in ExtensionCascadeOrder
            before ``.html``, then even if filenames with both
            extensions exist, only the .tmpl file will be returned.
        ExtensionCascadeOrder:
            A list of extensions, ordered by priority.

        """
        if baseName.find('*') != -1:
            return []
        fileStart = os.path.basename(baseName)
        dir = os.path.dirname(baseName)
        try:
            entries = os.listdir(dir)
        except OSError:
            # A directory that is missing or cannot be read has no
            # matching files; callers turn that into a 404.
            return []
        filenames = []
        dirnames = []
        for filename in entries:
            if filename.startswith('.'):
                continue
            elif filename == fileStart:
                if os.path.isdir(os.path.join(dir, filename)):
                    dirnames.append(os.path.join(dir, filename))
                else:
                    filenames.append(os.path.join(dir, filename))
            elif filename.startswith(fileStart) \
                    and os.path.splitext(filename)[0] == fileStart:
                filenames.append(os.path.join(dir, filename))
        # directories are not subject to the file filters below
        good = dirnames
        # Here's where all the settings (except cascading) come into play --
        # we filter the possible files based on settings here:
        for filename in filenames:
            ext = os.path.splitext(filename)[1]
            shortFilename = os.path.basename(filename)
            if ext in self._toIgnore and filename != baseName:
                continue
            if self._toServe and ext not in self._toServe:
                continue
            shouldServe = 1
            for regex in self._filesToHideRegexes:
                if regex.match(shortFilename):
                    shouldServe = 0
                    break
            if not shouldServe:
                continue
            if self._filesToServeRegexes:
                shouldServe = 0
                for regex in self._filesToServeRegexes:
                    if regex.match(shortFilename):
                        shouldServe = 1
                        break
                if not shouldServe:
                    continue
            good.append(filename)
        if self._useCascading and len(good) > 1:
            actualExtension = os.path.splitext(baseName)[1]
            for extension in self._cascadeOrder:
                if baseName + extension in good:
                    return [baseName + extension]
                if extension == actualExtension and baseName in good:
                    # The extension was already given in the request, so
                    # the match is the requested file itself, not a file
                    # with the extension appended a second time.
                    return [baseName]
        return good

    def parseIndex(self, trans, requestPath):
        """Return index servlet.

        Return the servlet for a directory index (i.e., ``Main`` or
        ``index``). When `parse` encounters a directory and there's nothing
        left in the URL, or when there is something left and no file matches
        it, then it will try `parseIndex` to see if there's an index file.

        That means that if ``/a/b/c`` is requested, and in ``/a`` there's no
        file or directory matching ``b``, then it'll look for an index file
        (like ``Main.py``), and that servlet will be returned. In fact, if
        no ``a`` was found, and the default context had an index (like
        ``index.html``) then that would be called with ``/a/b/c`` as
        `HTTPRequest.extraURLPath`. If you don't want that to occur, you
        should raise an HTTPNotFound in your no-extra-url-path-taking servlets.

        The directory names are based off the ``Application.config`` setting
        ``DirectoryFile``, which is a list of base names, by default
        ``["Main", "index", "main", "Index"]``, which are searched in order.
        A file with any extension is allowed, so the index can be an HTML file,
        a PSP file, a Kid template, a Python servlet, etc.

        Raises HTTPMovedPermanently if `requestPath` is empty, and
        HTTPNotFound if no (or no unambiguous) index file is found, or if
        there is extra path info while ``ExtraPathInfo`` is off.

        """
        req = trans.request()
        # If requestPath is empty, then we're missing the trailing slash:
        if not requestPath:
            p = req.serverURL() + '/'
            q = req.queryString()
            if q:
                p += "?" + q
            raise HTTPMovedPermanently(location=p)
        if requestPath == '/':
            requestPath = ''
        for directoryFile in self._directoryFile:
            basename = os.path.join(self._path, directoryFile)
            names = self.filenamesForBaseName(basename)
            if len(names) > 1 and self._useCascading:
                for ext in self._cascadeOrder:
                    if basename + ext in names:
                        names = [basename + ext]
                        break
            if len(names) > 1:
                warn("More than one file matches the index file %s in %s: %s"
                    % (directoryFile, self._path, names))
                raise HTTPNotFound("Index page is ambiguous")
            if names:
                if requestPath and not self._extraPathInfo:
                    raise HTTPNotFound
                req._serverSidePath = names[0]
                req._extraURLPath = requestPath
                return ServletFactoryManager.servletForFile(trans, names[0])
        raise HTTPNotFound("Index page not found")

    def initModule(self):
        """Get the __init__ module object for this FileParser's directory.

        Returns None if the directory has no importable __init__ module.

        """
        path = self._path
        # if this directory is a context, return the context package
        for context, dir in self._app.contexts().items():
            if dir == path:
                # avoid reloading of the context package
                return sys.modules[context]
        name = 'WebKit_Cache_' + _moduleNameRE.sub('_', path)
        try:
            file, path, desc = self._imp.find_module('__init__', [path])
        except (ImportError, TypeError):
            return None
        try:
            try:
                return self._imp.load_module(name, file, path, desc)
            except (ImportError, TypeError):
                return None
        finally:
            # The caller of find_module() is responsible for closing
            # the file it returns, even when loading fails.
            if file is not None:
                file.close()

    def parseInit(self, trans, requestPath):
        """Parse the __init__ file.

        Returns the resulting servlet, or None if no __init__ hooks were found.

        Hooks are put in by defining special functions or objects in your
        __init__, with specific names:

        `urlTransactionHook`:
            A function that takes one argument (the transaction).
            The return value from the function is ignored. You
            can modify the transaction with this function, though.

        `urlRedirect`:
            A dictionary. Keys in the dictionary are source
            URLs, the value is the path to redirect to, or a
            `URLParser` object to which the transaction should
            be delegated.

            For example, if the URL is ``/a/b/c``, and we've already
            parsed ``/a`` and are looking for ``b/c``, and we find
            ``urlRedirect`` in a.__init__, then we'll look for a key
            ``b`` in the dictionary. The value will be a directory
            we should continue to (say, ``/destpath/``). We'll
            then look for ``c`` in ``destpath``.

            If a key '' (empty string) is in the dictionary, then
            if no more specific key is found all requests will
            be redirected to that path.

            Instead of a string giving a path to redirect to, you
            can also give a URLParser object, so that some portions
            of the path are delegated to different parsers.

            If no matching key is found, and there is no '' key,
            then parsing goes on as usual.

        `SubParser`:
            This should be a class object. It will be instantiated,
            and then `parse` will be called with it, delegating to
            this instance. When instantiated, it will be passed
            *this* FileParser instance; the parser can use this to
            return control back to the FileParser after doing whatever
            it wants to do.

            You may want to use a line like this to handle the names::

                from ParserX import ParserX as SubParser

        `urlParser`:
            This should be an already instantiated URLParser-like
            object. `parse(trans, requestPath)` will be called
            on this instance.

        `urlParserHook`:
            Like `urlParser`, except the method
            `parseHook(trans, requestPath, fileParser)` will
            be called, where fileParser is this FileParser instance.

        `urlJoins`:
            Either a single path, or a list of paths. You can also
            use URLParser objects, like with `urlRedirect`.

            Each of these paths (or parsers) will be tried in
            order. If it raises HTTPNotFound, then the next path
            will be tried, ending with the current path.

            Paths are relative to the current directory. If you
            don't want the current directory to be a last resort,
            you can include '.' in the joins.

        """
        if self._initModule is None:
            self._initModule = self.initModule()
        mod = self._initModule
        # Every hook of a directory is run only once per transaction,
        # so that parsers handing control back do not loop forever.
        seen = trans._fileParserInitSeen.setdefault(self._path, {})

        if not seen.has_key('urlTransactionHook') \
                and hasattr(mod, 'urlTransactionHook'):
            seen['urlTransactionHook'] = 1
            mod.urlTransactionHook(trans)

        if not seen.has_key('urlRedirect') \
                and hasattr(mod, 'urlRedirect'):
            # @@: do we need this shortcircuit?
            seen['urlRedirect'] = 1
            try:
                nextPart, restPath = requestPath[1:].split('/', 1)
                restPath = '/' + restPath
            except ValueError:
                # there is only one part left in the path
                nextPart = requestPath[1:]
                restPath = ''
            if mod.urlRedirect.has_key(nextPart):
                redirTo = mod.urlRedirect[nextPart]
                redirPath = restPath
            elif mod.urlRedirect.has_key(''):
                redirTo = mod.urlRedirect['']
                redirPath = restPath
            else:
                redirTo = None
            if redirTo:
                if type(redirTo) is type(""):
                    fpp = FileParser(os.path.join(self._path, redirTo))
                else:
                    fpp = redirTo
                return fpp.parse(trans, redirPath)

        if not seen.has_key('SubParser') \
                and hasattr(mod, 'SubParser'):
            seen['SubParser'] = 1
            pp = mod.SubParser(self)
            return pp.parse(trans, requestPath)

        if not seen.has_key('urlParser') \
                and hasattr(mod, 'urlParser'):
            seen['urlParser'] = 1
            pp = mod.urlParser
            return pp.parse(trans, requestPath)

        if not seen.has_key('urlParserHook') \
                and hasattr(mod, 'urlParserHook'):
            seen['urlParserHook'] = 1
            pp = mod.urlParserHook
            return pp.parseHook(trans, requestPath, self)

        if not seen.has_key('urlJoins') \
                and hasattr(mod, 'urlJoins'):
            seen['urlJoins'] = 1
            joinPath = mod.urlJoins
            if type(joinPath) is type(""):
                joinPath = [joinPath]
            for path in joinPath:
                # Only strings are joined with the current directory (and
                # only once); anything else is used as a parser object.
                if type(path) is type(""):
                    parser = FileParser(os.path.join(self._path, path))
                else:
                    parser = path
                try:
                    return parser.parse(trans, requestPath)
                except HTTPNotFound:
                    pass

        return None


# `FileParser(path)` is the factory through which _FileParser instances
# are obtained throughout this module.
FileParser = ParamFactory(_FileParser)
class URLParameterParser(URLParser):
    """Strips named parameters out of the URL.

    For a URL like ``/path/SID=123/etc`` the segment ``SID=123`` is taken
    out of the path and stored as a field of the request -- unless the
    request already has a field of that name, in which case the value from
    the URL is dropped. The fields appear alongside the GET or POST
    variables.

    Install it in an __init__ file, like::

        from WebKit.URLParser import URLParameterParser
        urlParserHook = URLParameterParser()

    Or (slightly less efficient):

        from WebKit.URLParser import URLParameterParser as SubParser

    """

    ## Init ##

    def __init__(self, fileParser=None):
        """Remember the FileParser that `parse` hands the request back to."""
        self._fileParser = fileParser

    ## Parsing ##

    def parse(self, trans, requestPath):
        """Delegates to `parseHook`, using the stored FileParser as hook."""
        return self.parseHook(trans, requestPath, self._fileParser)

    def parseHook(self, trans, requestPath, hook):
        """Munges the path.

        `hook` is the FileParser object that called this parser. Every
        ``name=value`` segment is removed from `requestPath` and turned
        into a request field; the remaining path is passed back to
        ``hook.parse``, whose result is returned.

        """
        request = trans.request()
        remaining = []
        for segment in requestPath.split('/'):
            if segment.find('=') == -1:
                # an ordinary path segment stays in the URL
                remaining.append(segment)
                continue
            fieldName, fieldValue = segment.split('=', 1)
            # fields that already exist take precedence
            if not request.hasField(fieldName):
                request.setField(fieldName, fieldValue)
        strippedPath = '/'.join(remaining)
        return hook.parse(trans, strippedPath)
class ServletFactoryManagerClass:
    """Manage servlet factories.

    This singleton (called `ServletFactoryManager`) collects and manages
    all the servlet factories that are installed.

    See `addServletFactory` for adding new factories, and `servletForFile`
    for getting the factories back.

    """

    ## Init ##

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget all installed factories."""
        self._factories = []
        self._factoryExtensions = {}

    ## Manager ##

    def addServletFactory(self, factory):
        """Add a new servlet factory.

        Servlet factories can add themselves with::

            ServletFactoryManager.addServletFactory(factory)

        The factory must have an `extensions` method, which should
        return a list of extensions that the factory handles (like
        ``['.ht']``). The special extension ``.*`` will match any
        file if no other factory is found. See `ServletFactory`
        for more information.

        An AssertionError is raised if one of the extensions is already
        handled by another factory; in that case nothing is registered.

        """
        extensions = factory.extensions()
        # Check all extensions before registering anything, so that
        # a conflict does not leave a half-registered factory behind.
        for ext in extensions:
            existing = self._factoryExtensions.get(ext)
            assert existing is None, \
                "Extension %s for factory %s was already used by factory %s" \
                % (repr(ext), self._factoryName(factory),
                    self._factoryName(existing))
        self._factories.append(factory)
        for ext in extensions:
            self._factoryExtensions[ext] = factory

    def _factoryName(self, factory):
        """Return a printable name for a factory class or instance."""
        # Factories are usually instances, which have no __name__.
        name = getattr(factory, '__name__', None)
        if name is None:
            name = factory.__class__.__name__
        return name

    def factoryForFile(self, path):
        """Get a factory for a filename.

        Raises HTTPNotFound if neither the extension of `path` nor the
        catch-all extension ``.*`` has a factory.

        """
        ext = os.path.splitext(path)[1]
        factory = self._factoryExtensions.get(ext)
        if factory is None:
            factory = self._factoryExtensions.get('.*')
        if factory is None:
            raise HTTPNotFound
        return factory

    def servletForFile(self, trans, path):
        """Get a servlet for a filename and transaction.

        Uses `factoryForFile` to find the factory, which
        creates the servlet.

        """
        factory = self.factoryForFile(path)
        return factory.servletForTransaction(trans)


ServletFactoryManager = ServletFactoryManagerClass()
## Global Init ##
def initApp(app):
    """Initialize the application.

    Stores `app` in _globalApplication, where the application() function
    finds it, installs the standard servlet factories and lets
    `initParser` copy the relevant settings from Application.config.

    This needs to be called before any of the URLParser-derived classes
    are instantiated.

    """
    global _globalApplication
    _globalApplication = app
    # imported here rather than at the top of the module
    from UnknownFileTypeServlet import UnknownFileTypeServletFactory
    from ServletFactory import PythonServletFactory
    manager = ServletFactoryManager
    manager.reset()
    for factoryClass in (UnknownFileTypeServletFactory, PythonServletFactory):
        manager.addServletFactory(factoryClass(app))
    initParser(app)
def initParser(app):
    """Initialize the FileParser Class.

    Stores the application and the ``Application.config`` settings used
    by the file parsers as class variables of `_FileParser`. The glob
    patterns of ``FilesToHide`` and ``FilesToServe`` are compiled to
    regular expressions.

    """
    # The name the function is imported under must match the name that
    # is called below.
    from fnmatch import translate as fnTranslate
    cls = _FileParser
    cls._app = app
    cls._imp = app._imp
    cls._contexts = app.contexts
    cls._filesToHideRegexes = [re.compile(fnTranslate(pattern))
        for pattern in app.setting('FilesToHide')]
    cls._filesToServeRegexes = [re.compile(fnTranslate(pattern))
        for pattern in app.setting('FilesToServe')]
    cls._toIgnore = app.setting('ExtensionsToIgnore')
    cls._toServe = app.setting('ExtensionsToServe')
    cls._useCascading = app.setting('UseCascadingExtensions')
    cls._cascadeOrder = app.setting('ExtensionCascadeOrder')
    cls._directoryFile = app.setting('DirectoryFile')
    cls._extraPathInfo = app.setting('ExtraPathInfo')
|