"""Object-oriented tag-table generator objects
The objectgenerator module is the core of the SimpleParse
system, the various element token classes defined here
implement transitions from EBNF-style abstractions into
the low-level (assembly-like) instructions to the
TextTools engine.
Each class within the module is a sub-class of ElementToken,
which provides a number of common facilities, the most
obvious of which is the permute method, which takes care of
the negative, optional, and repeating flags for the normal
case (with character ranges and literals being non-normal).
"""
from simpleparse.stt.TextTools.TextTools import *
### Direct use of BMS is deprecated now...
try:
TextSearch
except NameError:
TextSearch = BMS
from simpleparse.error import ParserSyntaxError
import copy
class ElementToken:
"""Abstract base class for all ElementTokens
Common Attributes:
negative -- the element token should match
a character if the "base" definition
would not match at the current position
optional -- the element token will match even
if the base definition would not match
at the current position
repeating -- if the element is successfully
matched, attempt to match it again.
lookahead -- if true, the scanning position
of the engine will be reset after the
element matches
errorOnFail -- if true, the engine will call the
object stored in errorOnFail as a text-
matching object iff the element token fails
to match. This is used to signal
SyntaxErrors.
Attributes only used for top-level Productions:
report -- if true, the production's results
will be added to the result tree
expanded -- if true, the production's children's
results will be added to the result tree
but the production's own result will be ignored
"""
negative = 0
optional = 0
repeating = 0
report = 1
# note that optional and errorOnFail are mutually exclusive
errorOnFail = None
# any item may be marked as expanded,
# which says that it's a top-level declaration
# and that links to it should automatically expand
# as if the name wasn't present...
expanded = 0
lookahead = 0
def __init__( self, **namedarguments ):
"""Initialize the object with named attributes
This method simply takes the named attributes and
updates the object's dictionary with them
"""
self.__dict__.update( namedarguments )
def toParser( self, generator, noReport=0 ):
"""Abstract interface for implementing the conversion to a text-tools table
generator -- an instance of generator.Generator
which provides various facilities for discovering
other productions.
noReport -- if true, we're being called recursively
for a terminal grammar fragment where one of our
parents has explicitly suppressed all reporting.
This method is called by the generator or by
another element-token's toParser method.
"""
raise NotImplementedError( '''Element token generator abstract function called''' )
def permute( self, basetable ):
'''Given a positive, required, non-repeating table, convert to appropriately configured table
This method applies generic logic for applying the
operational flags to a basic recipe for an element.
It is normally called from theelementstokensown import
toParser method.
'''
flags = 0
if self.lookahead:
flags = flags + LookAhead
assert len(basetable) == 3, '''Attempt to permute a base table that already has fail flag set, can only permute unadorned tables'''
if self.negative:
# negative "matches" if it fails
# we add in the flags while we're at it...
basetable = (None, SubTable+flags, (
basetable + (1,2),
(None, EOF, Here,2,1), # if we hit eof, this didn't match, otherwise, we matched
(None, Fail, Here),# either hit eof or matched the client
(None,Skip,1),
))
elif flags:
# unpack, add the flags, and repack
tag, command, arg = basetable
basetable = ( tag, command+flags, arg)
if self.repeating:
### There are a number of problems with repetition that we'd like to solve
### via recursive table calls, but those are very expensive in the current
### implementation, so we need to use something a little more hacky...
if self.optional:
return [
## this would be the "simplistic" implementation...
## basetable + (1,0)
## it doesn't work because of cases
## where all-optional children "succeed" without consuming
## when within a repeating parent
## the EOF test isn't enough to fix the problem,
## as it's only checking a common case, not the underlying failure
basetable +(2,1), # fail, done, succeed, check for eof and if not, try matching again
# if we hit eof, no chance of further matches,
# consider ourselves done
(None, EOF, Here,-1,1),
]
elif self.errorOnFail:
return [
basetable+(1,2),
(None, Call, self.errorOnFail),
# as for optional...
basetable +(2,1),
(None, EOF, Here,-1,1),
]
else:
return [
basetable,
# as for optional...
basetable +(2,1),
(None, EOF, Here,-1,1),
]
else: # single
if self.optional:
return [
basetable +(1,1)
]
elif self.errorOnFail:
return [
basetable+(1,2),
(None, Call, self.errorOnFail),
]
else: # not optional
return [
basetable
]
def __repr__( self):
"""Return a readily recognisable version of ourself"""
from simpleparse import printers
return printers.asObject( self )
def terminal (self, generator):
"""Determine if this element is terminal for the generator"""
return 0
class Literal( ElementToken ):
"""Literal string value to be matched
Literals are one of the most common elements within
any grammar. The implementation tries to use the
most efficient mechanism available for matching/searching
for a literal value, so the Literal class does not
use the permute method, instead defining explicit
parsing methodologies for each flag and value combination
Literals in the SimpleParse EBNF grammar are defined like so:
"test", "test"?, "test"*, "test"+
-"test", -"test"?, -"test"*, -"test"+
Attributes:
value -- a string storing the literal's value
Notes:
Currently we don't support Unicode literals
See also:
CILiteral -- case-insensitive Literal values
"""
value = ""
def toParser( self, generator=None, noReport=0 ):
"""Create the parser for the element token"""
flags = 0
if self.lookahead:
flags = flags + LookAhead
base = self.baseToParser( generator )
if flags or self.errorOnFail:
if self.errorOnFail:
return [(None, SubTable+flags, tuple(base),1,2),(None, Call, self.errorOnFail)]
else:
return [(None, SubTable+flags, tuple(base))]
else:
return base
def baseToParser( self, generator=None ):
"""Parser generation without considering flag settings"""
svalue = self.value
if self.negative:
if self.repeating: # a repeating negative value, a "search" in effect
if self.optional: # if fails, then go to end of file
return [ (None, sWordStart, TextSearch( svalue ),1,2), (None, Move, ToEOF ) ]
else: # must first check to make sure the current position is not the word, then the same
return [
(None, Word, svalue, 2,1),
(None, Fail, Here),
(None, sWordStart, TextSearch( svalue ),1,2),
(None, Move, ToEOF )
]
#return [ (None, Word, svalue, 2,1),(None, Fail, Here),(None, WordStart, svalue,1,2), (None, Move, ToEOF ) ]
else: # a single-character test saying "not a this"
if self.optional: # test for a success, move back if success, move one forward if failure
if len(svalue) > 1:
return [ (None, Word, svalue, 2,1),
(None, Skip, -len(svalue), 2,2), # backup if this was the word to start of word, succeed
(None, Skip, 1 ) ] # else just move one character and succeed
else: # Uses Is test instead of Word test, should be faster I'd imagine
return [ (None, Is, svalue, 2,1),
(None, Skip, -1, 2,2), # backtrack
(None, Skip, 1 ) ] # else just move one character and succeed
else: # must find at least one character not part of the word, so
if len(svalue) > 1:
return [ (None, Word, svalue, 2,1),
(None, Fail, Here),
(None, Skip, 1 ) ] # else just move one character and succeed
else: #must fail if it finds or move one forward
return [ (None, Is, svalue, 2,1),
(None, Fail, Here),
(None, Skip, 1 ) ] # else just move one character and succeed
else: # positive
if self.repeating:
if self.optional:
if len(svalue) > 1:
return [ (None, Word, svalue, 1,0) ]
else:
return [ (None, Is, svalue, 1,0) ]
else: # not optional
if len(svalue) > 1:
return [ (None, Word, svalue),(None, Word, svalue,1,0) ]
else:
return [ (None, Is, svalue),(None, Is, svalue,1,0) ]
else: # not repeating
if self.optional:
if len(svalue) > 1:
return [ (None, Word, svalue, 1,1) ]
else:
return [ (None, Is, svalue, 1,1) ]
else: # not optional
if len(svalue) > 1:
return [ (None, Word, svalue) ]
else:
return [ (None, Word, svalue) ]
def terminal (self, generator):
"""Determine if this element is terminal for the generator"""
return 1
class _Range( ElementToken ):
"""Range of character values where any one of the characters may match
The Range token allows you to define a set of characters
(using a mini-grammar) of which any one may match. By using
the repetition flags, it is possible to easily create such
common structures as "names" and "numbers". For example:
name := [a-zA-Z]+
number := [0-9.eE]+
(Note: those are not beautifully defined examples :) ).
The mini-grammar for the simpleparsegrammar is defined as follows:
'[',CHARBRACE?,CHARDASH?, (CHARRANGE/CHARNOBRACE)*, CHARDASH?,']'
that is, if a literal ']' character is wanted, you must
define the character as the first item in the range. A literal
'-' character must appear as the first character after any
literal ']' character (or the beginning of the range) or as the
last character in the range.
Note: The expansion from the mini-grammar occurs before the
Range token is created (the simpleparse grammar does the
expansion), so the value attribute of the token is actually
the expanded string of characters.
"""
value = ""
requiresExpandedSet = 1
def toParser( self, generator=None, noReport=0 ):
"""Create the parser for the element token"""
flags = 0
if self.lookahead:
flags = flags + LookAhead
base = self.baseToParser( generator )
if flags or self.errorOnFail:
if self.errorOnFail:
return [(None, SubTable+flags, tuple(base),1,2),(None, Call, self.errorOnFail)]
else:
return [(None, SubTable+flags, tuple(base))]
else:
return base
# this should be a faster and more generic character set
# approach, but there's a bug with mxTextTools b3 which makes
# it non-functional, so for now I'm using the old version.
# Eventually this should also support the Unicode character sets
##try:
## CharSet
## class Range( _Range ):
## """Range type using the CharSet feature of mx.TextTools 2.1.0
##
## The CharSet type allows for both Unicode and 256-char strings,
## so we can use it as our 2.1.0 primary parsing mechanism.
## It also allows for simpler definitions (doesn't require that
## we pre-exand the character set). That's going to require support
## in the SimpleParse grammar, of course.
## """
## requiresExpandedSet = 0
## def baseToParser( self, generator=None ):
## """Parser generation without considering flag settings"""
## svalue = self.value
## print 'generating range for ', repr(svalue)
## if not svalue:
## raise ValueError( '''Range defined with no member values, would cause infinite loop %s'''%(self))
## if self.negative:
## svalue = '^' + svalue
## print ' generated', repr(svalue)
## svalue = CharSet(svalue)
## if self.repeating:
## if self.optional:
## return [ (None, AllInCharSet, svalue, 1 ) ]
## else: # not optional
## #return [ (None, AllInSet, svalue ) ]
## return [ (None, AllInCharSet, svalue ) ]
## else: # not repeating
## if self.optional:
## #return [ (None, IsInSet, svalue, 1 ) ]
## return [ (None, IsInCharSet, svalue, 1 ) ]
## else: # not optional
## #return [ (None, IsInSet, svalue ) ]
## return [ (None, IsInCharSet, svalue ) ]
##except NameError:
class Range( _Range ):
"""Range type which doesn't use the CharSet features in mx.TextTools
This is likely to be much slower than the CharSet version (below), and
is unable to handle unicode character sets. However, it will work with
TextTools 2.0.3, which may be needed in some cases.
"""
def baseToParser( self, generator=None ):
"""Parser generation without considering flag settings"""
svalue = self.value
if not svalue:
raise ValueError( '''Range defined with no member values, would cause infinite loop %s'''%(self))
if self.negative:
if self.repeating:
if self.optional:
#return [ (None, AllInSet, svalue, 1 ) ]
return [ (None, AllNotIn, svalue, 1 ) ]
else: # not optional
#return [ (None, AllInSet, svalue ) ]
return [ (None, AllNotIn, svalue ) ]
else: # not repeating
if self.optional:
#return [ (None, IsInSet, svalue, 1 ) ]
return [ (None, IsNotIn, svalue, 1 ) ]
else: # not optional
#return [ (None, IsInSet, svalue ) ]
return [ (None, IsNotIn, svalue ) ]
else:
if self.repeating:
if self.optional:
#return [ (None, AllInSet, svalue, 1 ) ]
return [ (None, AllIn, svalue, 1 ) ]
else: # not optional
#return [ (None, AllInSet, svalue ) ]
return [ (None, AllIn, svalue ) ]
else: # not repeating
if self.optional:
#return [ (None, IsInSet, svalue, 1 ) ]
return [ (None, IsIn, svalue, 1 ) ]
else: # not optional
#return [ (None, IsInSet, svalue ) ]
return [ (None, IsIn, svalue ) ]
def terminal (self, generator):
"""Determine if this element is terminal for the generator"""
return 1
class Group( ElementToken ):
"""Abstract base class for all group element tokens
The primary feature of a group is that it has a set
of element tokens stored in the attribute "children".
"""
children = ()
terminalValue = None
def terminal (self, generator):
"""Determine if this element is terminal for the generator"""
if self.terminalValue in (0,1):
return self.terminalValue
self.terminalValue = 0
for item in self.children:
if not item.terminal( generator):
return self.terminalValue
self.terminalValue = 1
return self.terminalValue
class SequentialGroup( Group ):
"""A sequence of element tokens which must match in a particular order
A sequential group must match each child in turn
and all children must be satisfied to consider the
group matched.
Within the simpleparsegrammar, the sequential group
is defined like so:
("a", b, c, "d")
i.e. a series of comma-separated element token definitions.
"""
def toParser( self, generator=None, noReport=0 ):
elset = []
for child in self.children:
elset.extend( child.toParser( generator, noReport ) )
basic = self.permute( (None, SubTable, tuple( elset)) )
if len(basic) == 1:
first = basic[0]
if len(first) == 3 and first[0] is None and first[1] == SubTable:
return tuple(first[2])
return basic
class CILiteral( SequentialGroup ):
"""Case-insensitive Literal values
The CILiteral is a sequence of literal and
character-range values, where each element is
positive and required. Literal values are
composed of those characters which are not
upper-case/lower-case pairs, while the ranges
are all two-character ranges with the upper
and lower forms.
CILiterals in the SimpleParse EBNF grammar are defined like so:
c"test", c"test"?, c"test"*, c"test"+
-c"test", -c"test"?, -c"test"*, -c"test"+
Attributes:
value -- a string storing the literal's value
Notes:
Currently we don't support Unicode literals
A CILiteral will be *much* slower than a
regular literal or character range
"""
value = ""
def toParser( self, generator=None, noReport=0 ):
elset = self.ciParse( self.value )
if len(elset) == 1:
# XXX should be compressing these out during optimisation...
# pointless declaration of case-insensitivity,
# or a single-character value
pass
basic = self.permute( (None, SubTable, tuple( elset)) )
if len(basic) == 1:
first = basic[0]
if len(first) == 3 and first[0] is None and first[1] == SubTable:
return tuple(first[2])
return basic
def ciParse( self, value ):
"""Break value into set of case-dependent groups..."""
def equalPrefix( a,b ):
for x in range(len(a)-1):
if a[x] != b[x]:
return x
result = []
a,b = value.upper(), value.lower()
while a and b:
# is there an equal literal run at the start?
stringPrefix = equalPrefix( a,b )
if stringPrefix:
result.append( (None, Word, a[:stringPrefix]) )
a,b = a[stringPrefix:],b[stringPrefix:]
# if we hit the end of the string, that's fine, just return
if not a and b:
break
# otherwise, the next character must be a case-differing pair
result.append( (None, IsIn, a[0]+b[0]) )
a,b = a[1:], b[1:]
return result
class ErrorOnFail(ElementToken):
"""When called as a matching function, raises a SyntaxError
Attributes:
expected -- list of strings describing expected productions
production -- string name of the production that's failing to parse
message -- overrides default message generation if non-null
(something,something)+!
(something,something)!
(something,something)+!"Unable to parse somethings in my production"
(something,something)!"Unable to parse somethings in my production"
if string -> give an explicit message (with optional % values)
else -> use a default string
"""
production = ""
message = ""
expected = ""
def __call__( self, text, position, end ):
"""Method called by mxTextTools iff the base production fails"""
error = ParserSyntaxError( self.message )
error.error_message = self.message
error.production = self.production
error.expected= self.expected
error.buffer = text
error.position = position
raise error
def copy( self ):
import copy
return copy.copy( self )
class FirstOfGroup( Group ):
"""Set of tokens that matches (and stops searching) with the first successful child
A FirstOf group attempts to match each child in turn,
declaring success with the first successful child,
or failure if none of the children match.
Within the simpleparsegrammar, the FirstOf group
is defined like so:
("a" / b / c / "d")
i.e. a series of slash-separated element token definitions.
"""
def toParser( self, generator=None, noReport=0 ):
elset = []
# should catch condition where a child is optional
# and we are repeating (which causes a crash during
# parsing), but doing so is rather complex and
# requires analysis of the whole grammar.
for el in self.children:
assert not el.optional, """Optional child of a FirstOf group created, this would cause an infinite recursion in the engine, child was %s"""%el
dataset = el.toParser( generator, noReport )
if len( dataset) == 1:# and len(dataset[0]) == 3: # we can alter the jump states with impunity
elset.append( dataset[0] )
else: # for now I'm eating the inefficiency and doing an extra SubTable for all elements to allow for easy calculation of jumps within the FO group
elset.append( (None, SubTable, tuple( dataset )) )
procset = []
for i in range( len( elset) -1): # note that we have to treat last el specially
procset.append( elset[i] + (1,len(elset)-i) ) # if success, jump past end
procset.append( elset[-1] ) # will cause a failure if last element doesn't match
procset = tuple(procset)
basetable = (None, SubTable, procset )
return self.permute( basetable )
class Prebuilt( ElementToken ):
"""Holder for pre-built TextTools tag tables
You can pass in a Pre-built tag table when
creating your grammar, doing so creates
Prebuilt element tokens which can be referenced
by the other element tokens in your grammar.
"""
value = ()
def toParser( self, generator=None, noReport=0 ):
return self.value
class LibraryElement( ElementToken ):
"""Holder for a prebuilt item with it's own generator"""
generator = None
production = ""
methodSource = None
def toParser( self, generator=None, noReport=0 ):
if self.methodSource is None:
source = generator.methodSource
else:
source = self.methodSource
basetable = self.generator.buildParser( self.production, source )
try:
if type(basetable[0]) == type(()):
if len(basetable) == 1 and len(basetable[0]) == 3:
basetable = basetable[0]
else:
# this is a table that got returned!
basetable = (None, SubTable, basetable)
return self.permute( basetable )
except:
print basetable
raise
class Name( ElementToken ):
"""Reference to another rule in the grammar
The Name element token allows you to reference another
production within the grammar. There are three major
sub-categories of reference depending on both the Name
element token and the referenced table's values.
if the Name token's report attribute is false,
or the target table's report attribute is false,
or the Name token negative attribute is true,
the Name reference will report nothing in the result tree
if the target's expand attribute is true, however,
the Name reference will report the children
of the target production without reporting the
target production's results (SubTable match)
finally:
if the target is not expanded and the Name token
should report something, the generator object is
asked to supply the tag object and flags for
processing the results of the target. See the
generator.MethodSource documentation for details.
Notes:
expanded and un-reported productions won't get any
methodsource methods called when
they are finished, that's just how I decided to
do it, not sure if there's some case where you'd
want it. As a result, it's possible to have a
method getting called for one instance (where a
name ref is reporting) and not for another (where
the name ref isn't reporting).
"""
value = ""
# following two flags are new ideas in the rewrite...
report = 1
def toParser( self, generator, noReport=0 ):
"""Create the table for parsing a name-reference
Note that currently most of the "compression" optimisations
occur here.
"""
sindex = generator.getNameIndex( self.value )
command = TableInList
target = generator.getRootObjects()[sindex]
reportSelf = (
(not noReport) and # parent hasn't suppressed reporting
self.report and # we are not suppressing ourselves
target.report and # target doesn't suppress reporting
(not self.negative) and # we aren't a negation, which doesn't report anything by itself
(not target.expanded) # we don't report the expanded production
)
reportChildren = (
(not noReport) and # parent hasn't suppressed reporting
self.report and # we are not suppressing ourselves
target.report and # target doesn't suppress reporting
(not self.negative) # we aren't a negation, which doesn't report anything by itself
)
if reportSelf:
svalue = self.value
else:
svalue = None
flags = 0
if target.expanded:
# the target is the root of an expandedname declaration
# so we need to do special processing to make sure that
# it gets properly reported...
command = SubTableInList
tagobject = None
# check for indirected reference to another name...
elif not reportSelf:
tagobject = svalue
else:
flags, tagobject = generator.getObjectForName( svalue )
if flags:
command = command | flags
if tagobject is None and not flags:
if self.terminal(generator):
if extractFlags(self,reportChildren) != extractFlags(target):
composite = compositeFlags(self,target, reportChildren)
partial = generator.getCustomTerminalParser( sindex,composite)
if partial is not None:
return partial
partial = tuple(copyToNewFlags(target, composite).toParser(
generator,
not reportChildren
))
generator.cacheCustomTerminalParser( sindex,composite, partial)
return partial
else:
partial = generator.getTerminalParser( sindex )
if partial is not None:
return partial
partial = tuple(target.toParser(
generator,
not reportChildren
))
generator.setTerminalParser( sindex, partial)
return partial
# base, required, positive table...
if (
self.terminal( generator ) and
(not flags) and
isinstance(target, (SequentialGroup,Literal,Name,Range))
):
partial = generator.getTerminalParser( sindex )
if partial is None:
partial = tuple(target.toParser(
generator,
#not reportChildren
))
generator.setTerminalParser( sindex, partial)
if len(partial) == 1 and len(partial[0]) == 3 and (
partial[0][0] is None or tagobject is None
):
# there is a single child
# it doesn't report anything, or we don't
partial = (partial[0][0] or tagobject,)+ partial[0][1:]
else:
partial = (tagobject, Table, tuple(partial))
return self.permute( partial )
basetable = (
tagobject,
command, (
generator.getParserList (),
sindex,
)
)
return self.permute( basetable )
terminalValue = None
def terminal (self, generator):
"""Determine if this element is terminal for the generator"""
if self.terminalValue in (0,1):
return self.terminalValue
self.terminalValue = 0
target = generator.getRootObject( self.value )
if target.terminal( generator):
self.terminalValue = 1
return self.terminalValue
def extractFlags( item, report=1 ):
"""Extract the flags from an item as a tuple"""
return (
item.negative,
item.optional,
item.repeating,
item.errorOnFail,
item.lookahead,
item.report and report,
)
def compositeFlags( first, second, report=1 ):
"""Composite flags from two items into overall flag-set"""
result = []
for a,b in map(None, extractFlags(first, report), extractFlags(second, report)):
result.append( a or b )
return tuple(result)
def copyToNewFlags( target, flags ):
"""Copy target using combined flags"""
new = copy.copy( target )
for name,value in map(None,
("negative","optional","repeating","errorOnFail","lookahead",'report'),
flags,
):
setattr(new, name,value)
return new
|