# $Id: lexer.py 2133 2006-09-06 18:52:56Z dairiki $
# lexer.py - template parsing routines for Myghty
# Copyright (C) 2004, 2005 Michael Bayer mike_mp@zzzcomputing.com
# Original Perl code and documentation copyright (c) 1998-2003 by Jonathan Swartz.
#
# This module is part of Myghty and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
#
import string, re, sys, codecs
from myghty import exception
from myghty.util import *
"""initial parser for a Myghty file, locates tokens and fires events in a Compiler object.
Lexer is currently stateful and is not thread safe. the clone() method can be used
to create copies of this object for use in multiple threads.
"""
# Maps every recognised <%name> block tag to the name of the Lexer method
# that parses its body.  A value of the form 'synonym:<other>' declares the
# tag to be an alias of block type <other>, whose parse method is used.
BLOCKS = {
    'args':         'variable_list_block',
    'attr':         'key_value_block',
    'flags':        'key_value_block',
    'cleanup':      'raw_block',
    'doc':          'doc_block',
    'filter':       'raw_block',
    'init':         'raw_block',
    'once':         'raw_block',
    'global':       'synonym:once',
    'threadonce':   'raw_block',
    'threadlocal':  'synonym:threadonce',
    'python':       'raw_block',
    'shared':       'raw_block',
    'requestlocal': 'synonym:shared',
    'requestonce':  'synonym:shared',
    'text':         'text_block',
}
# Maps the value of a <%python scope="..."> attribute to the block type
# that is reported to the compiler in its place.
PYTHON_SCOPES = {
    'component': 'python',
    'request':   'shared',
    'thread':    'threadonce',
    'global':    'once',
    'init':      'init',
    'cleanup':   'cleanup',
}
class Lexer:
    """Initial parser for a Myghty template.

    Walks the template source, recognises its tokens and fires the
    corresponding events on a compiler object.  All progress is kept in
    ``self.current`` (a LexContext), so a Lexer is stateful and not thread
    safe; use clone() to obtain a separate instance per thread.
    """

    # name=value pairs inside an opening block tag; the value is a quoted
    # string or a bare word.  Groups: (whole pair, key, value).
    _ATTRIBUTE_RE = re.compile(r"\s*((\w+)\s*=\s*('[^']*'|\"[^\"]*\"|\w+))\s*")

    # plain text up to the next construct of interest
    _TEXT_RE = re.compile(r"""
        (.*?)                   # anything, followed by:
        (
         (?<=\n)(?=[%#])        # an eval or comment line, preceded by a consumed \n
         |
         (?=</?[%&])            # a substitution or block or call start or end
                                # - don't consume
         |
         (\\\n)                 # an escaped newline - throw away
         |
         \Z                     # end of string
        )""", re.X | re.S)

    # body of a <% ... %> substitution, after the opening '<%' was consumed
    _SUBSTITUTION_RE = re.compile(r"""
        (.+?)                   # Substitution body ($1)
        (
         \s*
         (?<!\|)                # Not preceded by a '|'
         \|                     # A '|'
         \s*
         (                      # (Start $3)
          [^\W\d]\w*            # A flag
          (?:\s*,\s*[^\W\d]\w*)*    # More flags, with comma separators
         )
         \s*
        )?
        %>                      # Closing tag
        """, re.X | re.I | re.S)

    # one line of an <%args>-style block.  Groups: (name, default, newline).
    _VARIABLE_RE = re.compile(r"""
        (?:
          (?:
            [ \t]*
            ( [^\W\d]\w* ) #only allows valid Python variable names
            [ \t]*
            (?:
              (?:           # begin optional part of arg
                =
                ( [^\n]+ )  # default value, also consumes an inline comment, if any
              )
              |
              (?:           # an optional comment after an arg without a default
                [ \t]*
                \#
                [^\n]*
              )
            )?
          )
          |
          [ \t]*            # a comment line
          \#
          [^\n]*
          |
          [ \t]*            # just space
        )
        (\n?) # optional newline. the ? makes finditer() go into an endless loop.
        """, re.VERBOSE | re.I | re.M)

    # one line of an <%attr>/<%flags>-style block.  Groups: (key, value, newline).
    _KEY_VALUE_RE = re.compile(r"""
        (?:
          [ \t]*
          ([\w_]+)          # identifier
          [ \t]*[=:][ \t]*  # separator
          (\S[^\n]*)        # value ( must start with a non-space char)
          |
          [ \t]*            # an optional comment
          \#
          [^\n]*
          |
          [ \t]*            # just space
        )
        (\n?)
        """, re.VERBOSE | re.I)

    def __init__(self, **params):
        """creates an idle lexer; keyword parameters are accepted and ignored."""
        # LexContext of the lex in progress, None until lex() is called
        self.current = None
        # path of the file being lexed; set by lex(), used in error reports
        self.input_file = None

    def get_object_id(self):
        """returns an ID that can identify this lexer"""
        return "Myghty.Lexer"

    def clone(self, **params):
        """creates a clone of this Lexer.  allow the Prototype pattern
        to be used in creating lexers for use in other threads."""
        return Lexer(**params)

    class LexContext:
        """an object tracking the lexer's progress through a component block."""

        def __init__(self, source, name, compiler):
            """stores the template source, its name and the compiler that
            receives the lexer's events, and resets all progress state."""
            self.source = source
            self.name = name
            self.compiler = compiler

            # a regular expression to match the "end" of whatever construct
            # the parser located.  the default matches the end of the source;
            # match_named_block() swaps in the closing tag of a def, method
            # or closure section while that section is being lexed.
            self.ending = re.compile(r'\Z', re.S)

            # place to begin regular expression matching
            # (python has no equivalent of perl's \G)
            self.match_position = 0

            self.in_def = False
            self.in_method = False
            self.block_name = None
            self.block_type = None

            # number of newlines consumed so far
            self.lines = 0

        def set_in_named_block(self, block_type, name):
            """records that lexing has entered the named block ``name`` of
            type 'def', 'method' or 'closure'.

            Raises ValueError for any other block type.
            """
            if block_type == 'def':
                self.in_def = True
            elif block_type == 'method':
                self.in_method = True
            elif block_type != 'closure':
                # string exceptions are not supported by modern Pythons
                raise ValueError("invalid block type %s" % block_type)
            self.block_name = name
            self.block_type = block_type

        def reset_in_named_block(self):
            """clears the def/method flags and the block name."""
            self.in_def = False
            self.in_method = False
            self.block_name = None

        def match_pos(self, regstring = None, flags = None, regexp = None):
            """matches a regular expression at the current match position.

            Either a pattern string ``regstring`` (with optional ``flags``)
            or a compiled ``regexp`` is given.  On success the match position
            is advanced past the match and the match object is returned;
            otherwise the position is left alone and None is returned.
            """
            if regexp is None:
                if flags:
                    regexp = re.compile(regstring, flags)
                else:
                    regexp = re.compile(regstring)

            match = regexp.match(self.source, self.match_position)
            if match:
                (start, end) = match.span()
                # attempt to simulate perl's \G operator.  a zero-length
                # match steps one character forward so that the caller
                # cannot match the same empty string forever.
                if end == start:
                    self.match_position = end + 1
                else:
                    self.match_position = end
            return match

    def lex(self, source, name, compiler, input_file = None):
        """lexes ``source``, firing events on ``compiler``.

        ``name`` is the component name used in error reports and
        ``input_file`` is the optional full path the source came from, passed
        through to syntax errors.  The compiler's start_component() and
        end_component() bracket the lex; end_component() is called even when
        lexing fails.
        """
        # holds information about the current lex
        current = Lexer.LexContext(source, name, compiler)
        self.current = current
        self.input_file = input_file

        # clean up Mac and DOS line endings
        current.source = re.sub(r'\r\n?', "\n", current.source)

        # detect and remove a leading UTF-8 byte-order-marker; some windows
        # editors add these to mark their content as UTF-8.  the marker is a
        # byte string, so it is only looked for in byte-string sources
        # (comparing it against a text source appears to raise instead).
        bom = codecs.BOM_UTF8
        if isinstance(current.source, type(bom)) and current.source.startswith(bom):
            current.source = current.source[len(bom):]
            current.compiler.magic_encoding_comment('utf_8')

        try:
            current.compiler.start_component()
            self.start()
        finally:
            current.compiler.end_component()

    def start(self):
        """runs the main matching loop from the current position until the
        context's ``ending`` expression matches or the source is exhausted.

        Called by lex() for the whole component and recursively by
        match_named_block() for the body of a def, method or closure.
        Raises a syntax error when a def or method is left unclosed.
        """
        end = None
        length = len(self.current.source)

        while True:
            if self.current.match_position > length:
                break

            end = self.match_end()
            if end:
                break

            # the order matters: e.g. match_substitute() relies on
            # match_block() having been tried first
            if (self.match_block()
                    or self.match_named_block()
                    or self.match_substitute()
                    or self.match_comp_call()
                    or self.match_python_line()
                    or self.match_comp_content_call()
                    or self.match_comp_content_call_end()
                    or self.match_text()):
                continue

            isend = (self.current.match_position > len(self.current.source))
            if (self.current.in_def or self.current.in_method) and isend:
                self.raise_syntax_error("Missing closing </%%%s> tag" % self.current.block_type)
            if isend:
                break

            raise exception.Compiler("Infinite parsing loop encountered - Lexer bug?")

        if self.current.in_def or self.current.in_method:
            block_type = self.current.block_type
            if not isinstance(end, str) or not self.current.ending.match(end):
                block_name = self.current.block_name
                self.raise_syntax_error("no closing </%%%s> tag for <%%%s %s> block" % (block_type, block_type, block_name))

    def match_block(self):
        """matches an opening tag of one of the BLOCKS types, e.g. <%args>,
        and lexes the whole block by dispatching to its parse method.

        Returns True when a block was consumed, False when the current
        position does not start one.
        """
        block_re = re.compile(r'\<%(' + '|'.join(BLOCKS.keys()) + r')(\s+[^>]*)?\s*>', re.I | re.S)
        match = self.current.match_pos(regexp = block_re)
        if not match:
            return False

        (block_type, attr) = (match.group(1).lower(), match.group(2))

        # remember the type of an enclosing named block so that it can be
        # put back afterwards; error messages in start() depend on it
        enclosing_type = self.current.block_type
        self.current.block_type = block_type

        attributes = {}
        if attr:
            for (full, key, val) in self._ATTRIBUTE_RE.findall(attr):
                try:
                    # NOTE(review): eval() of template-supplied text.  the
                    # attribute regex limits it to a quoted string or a bare
                    # word, but the template must still be trusted.
                    attributes[key] = eval(val)
                except Exception:
                    (e, msg) = sys.exc_info()[0:2]
                    self.raise_syntax_error("Non-evaluable attribute value: '%s' (%s: %s)" % (val, e, msg))

        # get method name for this block, following a 'synonym:' entry
        syntype = None
        try:
            method = BLOCKS[block_type]
            if ':' in method:
                syntype = method.split(':', 1)[-1]
                method = BLOCKS[syntype]
        except KeyError:
            self.raise_syntax_error("no such block type '%s'" % block_type)

        # a scope attribute on a python block selects a different block type.
        # the scope must be present before it is looked up.
        if 'scope' in attributes and (block_type == 'python' or syntype == 'python'):
            try:
                syntype = PYTHON_SCOPES[attributes['scope']]
            except KeyError:
                self.raise_syntax_error("unknown python scope '%s'" % attributes['scope'])

        if syntype:
            self.current.compiler.start_block(block_type = syntype, attributes = attributes)
        else:
            self.current.compiler.start_block(block_type = block_type, attributes = attributes)

        # call method dynamically
        getattr(self, method)(block_type = block_type, synonym_for = syntype, attributes = attributes)

        self.current.block_type = enclosing_type
        return True

    def match_named_block(self):
        """matches an opening <%def name>, <%method name> or <%closure name>
        tag and recursively lexes the block's body up to its closing tag.

        Returns True when a block was consumed, False when the current
        position does not start one.  Raises a syntax error when the tag
        carries no name.
        """
        match = self.current.match_pos(regexp = re.compile(r"<%(def|method|closure)(?:\s+([^\n]+?))?(\s+[^>]*)?\s*>", re.I | re.S))
        if not match:
            return False

        (block_type, name, attr) = (match.group(1).lower(), match.group(2), match.group(3))

        # attribute values are passed on exactly as written, quotes included
        attributes = {}
        if attr:
            for (full, key, val) in self._ATTRIBUTE_RE.findall(attr):
                attributes[key] = val

        if not block_type or not name:
            self.raise_syntax_error("%s block without a name" % block_type)

        current = self.current
        current.compiler.start_named_block(block_type = block_type, name = name, attributes = attributes)

        # preserve the state of the enclosing block (if any), so that a
        # nested block does not wipe out the flags of its parent
        saved_state = (current.ending, current.in_def, current.in_method,
                       current.block_name, current.block_type)

        # the nested lex ends at this block's closing tag
        current.ending = re.compile(r"<\/%%%s>(\n?)" % block_type, re.I)
        current.set_in_named_block(block_type = block_type, name = name)

        # recursively call the start() stuff
        self.start()

        # tell compiler to close up the block
        current.compiler.end_named_block(block_type = block_type)

        # restore the state of the enclosing compile
        (current.ending, current.in_def, current.in_method,
         current.block_name, current.block_type) = saved_state

        return True

    def match_text(self):
        """matches plain text up to the next construct and sends it to the
        compiler's text_block().  Returns True on a match, else False."""
        current = self.current
        match = current.match_pos(regexp = self._TEXT_RE)
        if not match:
            return False

        text = match.group(1)
        current.compiler.text_block(block = text)
        current.lines += self._count_lines(text)
        # group 3 is an escaped newline, which is discarded but still counted
        if match.group(3):
            current.lines += 1
        return True

    def match_substitute(self):
        """matches a <% expression | flags %> substitution.

        Returns True when one was consumed and False when the current
        position does not start with '<%'.  Raises a syntax error when the
        closing '%>' is missing.
        """
        # This routine relies on there *not* to be an opening <%foo> tag
        # present, so match_block() must happen first.
        if not self.current.match_pos(r"<%"):
            return False

        match = self.current.match_pos(regexp = self._SUBSTITUTION_RE)
        if match:
            (body, extra, escape) = match.group(1, 2, 3)
            self.current.lines += self._count_lines(body)
            if extra:
                self.current.lines += self._count_lines(extra)
            self.current.compiler.substitution(body, escape)
            return True
        else:
            self.raise_syntax_error("'<%' without matching '%>'")

    def match_comp_call(self):
        """matches a <& component call &>.

        Returns True when one was consumed, False when the current position
        does not start one.  Raises a syntax error when '&>' is missing.
        """
        if not self.current.match_pos(regexp = re.compile(r"<&(?!\|)", re.S)):
            return False

        match = self.current.match_pos(regexp = re.compile(r"(.*?)&>", re.S))
        if match:
            call = match.group(1)
            self.current.compiler.component_call(call)
            self.current.lines += self._count_lines(call)
            return True
        else:
            self.raise_syntax_error("'<&' without matching '&>'")

    def match_comp_content_call(self):
        """matches the opening <&| component call &> of a call with content.

        Returns True when one was consumed, False when the current position
        does not start one.  Raises a syntax error when '&>' is missing.
        """
        if not self.current.match_pos(regexp = re.compile(r"<&\|", re.S)):
            return False

        match = self.current.match_pos(regexp = re.compile(r"(.*?)&>", re.S))
        if match:
            call = match.group(1)
            self.current.compiler.component_content_call(call)
            self.current.lines += self._count_lines(call)
            return True
        else:
            self.raise_syntax_error("'<&|' without matching '&>'")

    def match_comp_content_call_end(self):
        """matches the closing </&> of a component call with content.
        Returns True on a match, else False."""
        if self.current.match_pos(r"</&>"):
            self.current.compiler.component_content_call_end()
            return True
        return False

    def match_block_end(self, block_type, allow_text = True, **params):
        """matches the closing </%block_type> tag.

        With ``allow_text`` any text before the tag is consumed as well and
        the result is the tuple (text, newline); without it only whitespace
        may precede the tag and the result is just the newline.  ``newline``
        is the line break directly after the tag, or an empty string.
        Further keyword parameters are accepted and ignored.  Raises a
        syntax error when the tag cannot be matched.
        """
        if allow_text:
            regex = re.compile(r"(.*?)</%%%s>(\n?)" % block_type, re.I | re.S)
        else:
            regex = re.compile(r"\s*</%%%s>(\n?)" % block_type, re.I | re.S)

        # the expression is already compiled, so hand it over as such
        match = self.current.match_pos(regexp = regex)
        if match:
            if allow_text:
                return tuple(match.group(1, 2))
            else:
                return match.group(1)
        else:
            self.raise_syntax_error("Invalid <%%%s> section line" % block_type)

    def match_python_line(self):
        """matches a line starting with '%' (python code) or '#' (comment).

        Python lines are sent to the compiler's python_line().  A comment
        within the first two lines is searched for a "coding" declaration,
        which is reported through magic_encoding_comment().  Returns True on
        a match, else False.
        """
        match = self.current.match_pos(r"(?<=^)([%#])([^\n]*)(?:\n|\Z)", re.M)
        if not match:
            return False

        (marker, line) = match.group(1, 2)
        if marker == '#':
            if self.current.lines < 2:
                # Magic -*- encoding: foo -*- comment
                m = re.search(r'coding[=:]\s*([-\w.]+)', line)
                if m:
                    self.current.compiler.magic_encoding_comment(m.group(1))
        else:
            self.current.compiler.python_line(line = line)

        self.current.lines += 1
        return True

    def match_end(self):
        """matches the context's current ``ending`` expression.

        Returns the matched text, True when the match was empty (end of
        source), or False when there was no match.
        """
        match = self.current.match_pos(regexp = self.current.ending)
        if not match:
            return False

        matched = match.group()
        self.current.lines += self._count_lines(matched)
        if matched:
            return matched
        return True

    def _block_body(self, block_type):
        """consumes and returns the raw text up to, but not including, the
        closing </%block_type> tag; returns '' when there is no such tag."""
        match = self.current.match_pos(regexp = re.compile(r""".*?(?= <\/%%%s> )""" % block_type, re.M | re.S | re.X))
        if not match:
            return ''
        # match_pos() steps one character past a zero-length match, which
        # would skip the '<' of the closing tag of an empty block
        self.current.match_position = match.end()
        return match.group()

    def variable_list_block(self, block_type, attributes = None, **params):
        """lexes the body of a variable-list block such as <%args>.

        Every declared variable is reported through the compiler's
        variable_declaration() together with its default (an empty string
        when none is given) and the block's ``scope`` attribute, followed by
        end_block().  Raises a syntax error when the closing tag is missing
        or preceded by something that is not a declaration.
        """
        # the whole block is cut out of the source first and then picked
        # apart, since a single all-in-one expression did not work here
        source = self._block_body(block_type)

        scope = None
        if attributes is not None and 'scope' in attributes:
            scope = attributes['scope']

        # findall() rather than finditer(): the original author found that
        # finditer() looped endlessly on the optional trailing newline
        for (name, default, linebr) in self._VARIABLE_RE.findall(source):
            if name:
                self.current.compiler.variable_declaration(block_type = block_type,
                                                           name = name,
                                                           default = default,
                                                           scope = scope)
            if linebr:
                self.current.lines += 1

        params['allow_text'] = False
        nl = self.match_block_end(block_type = block_type, **params)
        if nl:
            self.current.lines += 1
        self.current.compiler.end_block(block_type = block_type)

    def key_value_block(self, block_type, **params):
        """lexes the body of a key/value block such as <%attr> or <%flags>.

        Every pair is reported through the compiler's key_value_pair(),
        followed by end_block().  Raises a syntax error when the closing tag
        is missing or preceded by something that is not a key/value line.
        """
        # done like variable_list_block(): cut the block out first
        source = self._block_body(block_type)

        for (key, value, newline) in self._KEY_VALUE_RE.findall(source):
            if key:
                self.current.compiler.key_value_pair(block_type = block_type,
                                                     key = key, value = value)
            if newline:
                self.current.lines += 1

        params['allow_text'] = False
        nl = self.match_block_end(block_type = block_type, **params)
        if nl:
            self.current.lines += 1
        self.current.compiler.end_block(block_type = block_type)

    def generic_block(self, method, **params):
        """lexes a block whose body is handed to the compiler as one string.

        ``method`` names the compiler method that receives the body.  The
        block type reported to the compiler is ``synonym_for`` when that
        parameter is given and not None, otherwise ``block_type``.
        """
        params['allow_text'] = True
        (block, n1) = self.match_block_end(**params)

        compiler_block_type = params.get('synonym_for')
        if compiler_block_type is None:
            compiler_block_type = params['block_type']

        getattr(self.current.compiler, method)(block_type = compiler_block_type, block = block)
        self.current.lines += self._count_lines(block)
        if n1:
            self.current.lines += 1
        self.current.compiler.end_block(block_type = compiler_block_type)

    def text_block(self, **params):
        """lexes a block and reports its body through text_block()."""
        self.generic_block('text_block', **params)

    def raw_block(self, **params):
        """lexes a block and reports its body through raw_block()."""
        self.generic_block('raw_block', **params)

    def doc_block(self, **params):
        """lexes a block and reports its body through doc_block()."""
        self.generic_block('doc_block', **params)

    def line_number(self):
        """returns the 1-based number of the line currently being lexed."""
        return self.current.lines + 1

    def get_name(self):
        """returns the name of the component currently being lexed."""
        return self.current.name

    def _count_lines(self, text):
        """returns the number of newlines in ``text``."""
        return len(re.findall(r"\n", text))

    def _current_line(self):
        """returns the consumed part of the source line currently being
        lexed, or '' when it cannot be determined."""
        lines = re.split(r"\n", self.current.source[0:self.current.match_position])
        if len(lines) <= self.current.lines:
            return ''
        else:
            return lines[self.current.lines]

    def raise_syntax_error(self, error):
        """raises exception.Syntax for ``error``, annotated with the
        component name, source line, line number, source, file and source
        encoding of the lex in progress."""
        raise exception.Syntax(
            error = error,
            comp_name = self.get_name(),
            source_line = self._current_line(),
            line_number = self.line_number(),
            source = self.current.source,
            file = self.input_file,
            source_encoding = self.current.compiler.get_encoding())
|