"""A simple example of parsing
I have no idea for whom I originally created this code,
(which was originally written for SimpleParse 1.0) nor
why they wanted it. Oh well, such is life.
Running as a script will do some timing tests, but the
tests are rather... simplistic.
The grammar is slow, parsing at around 5-10% of the speed I
normally expect from SimpleParse/mxTextTools parsers.
I'm guessing it gets into lots and lots of partial parses
of the "interesting" production, and that the huge number
of reported productions slows it down. For example,
making atom non-reporting gives a 15% speedup on my
machine.
"""
# SimpleParse EBNF grammar for a prefix notation built from '*' and '+'.
#
# - `multset` / `plusset` are an operator character followed by two operands,
#   each of which is either a nested `set` or a single `atom`.
# - `atom` is written as a negated character range, i.e. one character that
#   is neither '+' nor '*'.
# - `interesting` is an ordered choice over the hard-coded `example*`
#   patterns (example8 is tried first, example1 last).  The `>...<` markers
#   are SimpleParse's "expanded production" notation, so the matched
#   `example*` node should be reported in place of `interesting` itself
#   (NOTE(review): confirm against the SimpleParse grammar documentation).
# - example2..example8 contain a literal '++' *between* their operands, so
#   their reported children are not contiguous in the source text.
declaration = r'''
set := (interesting/multset/plusset)+
multset := '*',(set/atom), (set/atom)
plusset := '+',(set/atom), (set/atom)
atom := -[+*]
>interesting< := (example8/example7/example6/example5/example4/example3/example2/example1)
example1 := '*+',(set/atom),(set/atom),'+',(set/atom),(set/atom)
example2 := '**',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example3 := 'fsd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example4 := 'm*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example5 := 'a*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example6 := 's*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example7 := 'bdf*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
example8 := 'sd*',(set/atom),(set/atom),'++',(set/atom),(set/atom),(set/atom)
'''
import sys, string
from simpleparse.parser import Parser
# Module-level parser, built once at import time from the grammar above and
# shared by every Emitter instance; 'set' is passed as the root production.
parser = Parser(declaration,'set')
class Emitter:
    """Re-emit parsed source text, rewriting selected productions.

    Result-tree nodes are 4-tuples ``(tag, start, stop, children)`` whose
    ``start``/``stop`` offsets index into the most recently parsed string
    (``self.data``).  A node tagged ``X`` is handled by a method named
    ``emitX`` when the instance has one; every other node is reproduced
    from the source text with its children emitted recursively.
    """

    def process(self, data):
        """Parse *data* and return the transformed text."""
        result = self.parse(data)
        # The top-level parse result uses a different layout from the nodes
        # (children at index 1, end position last), so wrap it in a
        # synthetic 'set' node that emit() can walk like any other.
        root = ('set', 0, result[-1], result[1])
        return self.emit(root)

    def parse(self, data):
        """Parse *data* with the module-level parser and return its result.

        *data* is stored on the instance because emit() slices the original
        text using the offsets recorded in the result tree.
        """
        self.data = data
        return parser.parse(data)

    def write(self, data):
        """Write *data* to standard output."""
        sys.stdout.write(data)

    def emit(self, tree):
        """Return the output text for a single result-tree node.

        Dispatches to ``self.emit<tag>(tree)`` when such a method exists.
        Otherwise the node's own text is copied verbatim, with each child
        replaced by its emitted form.
        """
        handler = getattr(self, 'emit' + tree[0], None)
        if handler is not None:
            # An explicit processing function exists for this production.
            return handler(tree)

        children = tree[3]
        if not children:
            # Leaf node: just re-emit the same text.
            return self.data[tree[1]:tree[2]]

        pieces = []
        cursor = tree[1]
        for child in children:
            # Literal text that belongs to this node but precedes the child.
            # Copying it for every child (not only the first) keeps text
            # that sits *between* children, such as the '++' inside
            # example2..example8, from being dropped.
            pieces.append(self.data[cursor:child[1]])
            pieces.append(self.emit(child))
            cursor = child[2]
        # Trailing literal text after the last child.
        pieces.append(self.data[cursor:tree[2]])
        return ''.join(pieces)

    def emitexample1(self, tuple):
        """Distribute the product over both sums: *+AB+CD -> ++*AC*AD+*BC*BD.

        The parameter name shadows the builtin; it is kept unchanged so
        existing callers are unaffected.
        """
        a, b, c, d = map(self.emit, tuple[3])
        return '++*%s%s*%s%s+*%s%s*%s%s' % (a, c, a, d, b, c, b, d)
if __name__ == "__main__":
    # Crude timing harness: parse each sample once and report the elapsed
    # wall-clock time and throughput in characters per second.
    import time

    testdata = [
        '++m*++mkp+f*nkf',
        '*+ab+cd',
        '+ab+bc+de',
        '*ab*bc*de',
        '++m*++mkp+f*nkf' * 10000,
    ]
    a = Emitter()
    for test in testdata:
        t = time.time()
        a.parse(test)
        t = time.time() - t
        # Single-argument print(...) with %-formatting produces the same
        # output whether print is a statement (Python 2) or a function.
        print('total time %s length %s' % (t, len(test)))
        if t:
            # Skipped when the clock resolution reports zero elapsed time,
            # which would otherwise divide by zero.
            print(' %s cps' % (len(test) / t))
|