# benchmark
# MSXML: This can be downloaded from many places. You need 3.0
# which is NOT in most newly installed Windows boxes. (650kb)
# http://download.microsoft.com/download/xml/Install/3.0/WIN98Me/EN-US/msxml3.exe
# for a quick tutorial on MSXML 3.0, see
# http://www.perfectxml.com/articles/xml/msxml30.asp
# You should then run the COM MakePy utility from the Pythonwin menu
# to get it going as fast as possible.
import sys
import glob
import time
import string
from types import TupleType
import cStringIO
def tupleTreeStats(node):
    """Count the tags and attributes in a tuple tree, recursively.

    Used for all the parsers in this file that build tuple trees.
    node[1] holds the node's attributes (or None) and node[2] holds its
    children (or None).  Children that are not tuples (for example
    strings) are not counted and not descended into.

    Returns (tagCount, attrCount).  This is the order in which
    interact() unpacks the result and in which the other stats
    functions in this file report theirs; returning the pair the other
    way round made the printed tag and attribute counts swap places.
    """
    attrs = node[1]
    children = node[2]

    tagCount = 1
    if attrs is None:
        attrCount = 0
    else:
        attrCount = len(attrs)

    if children is not None:
        for child in children:
            if isinstance(child, tuple):
                n, a = tupleTreeStats(child)
                tagCount = tagCount + n
                attrCount = attrCount + a
    return tagCount, attrCount
### pyRXP - our wrapper around the University of Edinburgh's RXP parser
def getPyRXPParser():
    """Import pyRXP and return a Parser built with its default settings."""
    import pyRXP
    return pyRXP.Parser()
def getNonValidatingPyRXPParser():
    """Import pyRXP and return a Parser created with Validate=0."""
    import pyRXP
    return pyRXP.Parser(Validate=0)
def parseWithPyRXP(parser, rawdata):
    """Parse rawdata with the given pyRXP parser and return its result."""
    tree = parser.parse(rawdata)
    return tree
### rparsexml - Aaron's very fast pure python parser
def loadRparseXML():
    """Return the reportlab.lib.rparsexml module.

    There is no parser object to construct here, so the module itself
    is what gets handed to parseWithRParseXML.
    """
    from reportlab.lib import rparsexml as module
    return module
def parseWithRParseXML(rparsexml, rawdata):
    """Parse rawdata with the rparsexml module passed as first argument.

    Only the first item of what parsexml0() returns is handed back.
    """
    result = rparsexml.parsexml0(rawdata)
    return result[0]
### expattree - tree-building wrapper around pyexpat
def getExpatParser():
    """Import expattree and return a new ExpatTreeParser instance."""
    import expattree
    treeParser = expattree.ExpatTreeParser()
    return treeParser
def parseWithExpat(expatParser, rawdata):
    """Parse rawdata with the given expat tree parser and return its result."""
    tree = expatParser.parse(rawdata)
    return tree
####### minidom - non-validating DOM parser in the Python distro
def loadMiniDOM():
    """Return the xml.dom.minidom module; it acts as the 'parser' object."""
    from xml.dom import minidom
    return minidom
def parseWithMiniDOM(dom_module, rawdata):
    """Parse rawdata via dom_module.parseString and return the result.

    dom_module is the module returned by loadMiniDOM.
    """
    document = dom_module.parseString(rawdata)
    return document
def statsWithMiniDOM(node):
    """Count the element tags and attributes at and below a minidom node.

    node is normally the document returned by parseWithMiniDOM, but any
    minidom node works.  Only element nodes are counted as tags; their
    attributes are counted through the element's attribute map.

    Returns (tagCount, attrCount), the order interact() unpacks.
    This replaces a placeholder that returned (1, 0) for every document.
    """
    tagCount = 0
    attrCount = 0
    # explicit stack rather than recursion, so deep documents cannot
    # run into the interpreter's recursion limit
    pending = [node]
    while pending:
        current = pending.pop()
        if current.nodeType == current.ELEMENT_NODE:
            tagCount = tagCount + 1
            attrCount = attrCount + len(current.attributes)
        pending.extend(current.childNodes)
    return tagCount, attrCount
######### Microsoft XML Parser via COM ######################
def loadMSXML30():
    """Return a COM object created for the 'Microsoft.XMLDOM' ProgID."""
    from win32com.client import Dispatch
    return Dispatch('Microsoft.XMLDOM')
def parseWithMSXML30(msx, rawdata):
    """Load rawdata into the MSXML object and return that same object."""
    # NOTE(review): whatever loadXML returns is discarded; presumably it
    # signals success or failure - confirm whether a failed parse
    # should be detected here.
    msx.loadXML(rawdata)
    document = msx
    return document
def statsWithMSXML30(node):
    """Placeholder stats: not implemented, node is never inspected.

    Always reports one tag and no attributes.
    """
    tagCount = 1
    attrCount = 0
    return (tagCount, attrCount)
###########4DOM ###############
def load4DOM():
    """Return a new 4DOM PyExpat reader.

    The previous 'from xml.dom import Node' was never used and has been
    dropped; importing xml.dom.ext.reader already loads the xml.dom
    package, so nothing the function returns depends on it.
    """
    from xml.dom.ext.reader import PyExpat
    return PyExpat.Reader()
def parseWith4DOM(reader, rawdata):
    """Parse rawdata with reader.fromString and return the result."""
    document = reader.fromString(rawdata)
    return document
def statsWith4DOM(node):
    """Placeholder stats: node is never inspected.

    Always reports one tag and no attributes.
    """
    tagCount = 1
    attrCount = 0
    return (tagCount, attrCount)
def loadCDomlette():
    """Return the Ft.Lib.cDomlettec module; it acts as the 'parser' object."""
    from Ft.Lib import cDomlettec as module
    return module
def parseWithCDomlette(modul, rawdata):
    """Parse rawdata with the cDomlette module passed as first argument.

    modul.parse is given a file-like object, so rawdata is wrapped in a
    cStringIO stream; the second argument passed is an empty string.
    """
    stream = cStringIO.StringIO(rawdata)
    document = modul.parse(stream, '')
    return document
def statsWithCDomlette(node):
    """Placeholder stats: node is never inspected.

    Always reports one tag and no attributes.
    """
    tagCount = 1
    attrCount = 0
    return (tagCount, attrCount)
##########put them all together################
# One row per parser under test.  Each row is
#     (name, loader, parse function, stats function)
# interact() calls the loader with no arguments, passes what it returns
# plus the sample text to the parse function, and passes the parse
# result to the stats function.
TESTMAP = [
    ('pyRXP', getPyRXPParser, parseWithPyRXP, tupleTreeStats),
    ('pyRXP_nonvalidating', getNonValidatingPyRXPParser, parseWithPyRXP, tupleTreeStats),
    ('rparsexml', loadRparseXML, parseWithRParseXML, tupleTreeStats),
    ('expat', getExpatParser, parseWithExpat, tupleTreeStats),
    ('minidom', loadMiniDOM, parseWithMiniDOM, statsWithMiniDOM),
    ('msxml30', loadMSXML30, parseWithMSXML30, statsWithMSXML30),
    ('4dom', load4DOM, parseWith4DOM, statsWith4DOM),
    ('cdomlette', loadCDomlette, parseWithCDomlette, statsWithCDomlette),
]
def _readSampleText(dtd):
    """Return the text of rml_a.xml, minus its first two lines if dtd is false."""
    if not dtd:
        print('DTD declaration removed, non-validating')
    f = open('rml_a.xml')
    try:
        if dtd:
            text = f.read()
        else:
            text = ''.join(f.readlines()[2:])
    finally:
        # close explicitly rather than relying on garbage collection
        f.close()
    return text


def _findTest(testName):
    """Return the TESTMAP row named testName, or None if there is none."""
    for row in TESTMAP:
        if row[0] == testName:
            return row
    return None


def _askForTest(sampleText):
    """Show the parser menu and return the chosen TESTMAP row, or None to stop."""
    print("Interactive benchmark suite for Python XML tree-parsers.")
    print('Using sample XML file %d bytes long' % len(sampleText))
    print("Parsers available:")
    i = 1
    for row in TESTMAP:
        print('\t%d. %s' % (i, row[0]))
        i = i + 1
    print('')
    inp = raw_input('Parser number (or x to exit) > ')
    if inp == 'x':
        print('bye')
        return None
    try:
        num = int(inp)
    except ValueError:
        num = 0
    # 0 and negative numbers would otherwise index from the end of the list
    if num < 1 or num > len(TESTMAP):
        print('no parser numbered %s' % inp)
        return None
    return TESTMAP[num - 1]


def interact(testName=None, dtd=1, pause='unknown'):
    """Benchmark one parser from TESTMAP against the file rml_a.xml.

    testName -- name of a TESTMAP row.  If it is empty, or names no
                row, a menu is shown and the parser is chosen by number.
    dtd      -- if false, the first two lines of the sample file are
                dropped; the lack of a DTD reference puts validating
                parsers into non-validating mode.
    pause    -- true to stop before and after parsing and ask for the
                process memory figures, false to skip that, or
                'unknown' to ask the user which they want.

    Prints the time taken to load the parser, to parse the sample and
    to run the stats function over the result, plus memory figures when
    pausing.  The stats function's result is read as
    (tag count, attribute count).  Returns None.

    Raises ValueError if the answer to the memory-test question is
    neither 'y' nor 'n'.
    """
    sampleText = _readSampleText(dtd)

    row = None
    if testName:
        row = _findTest(testName)
        if row is None:
            # fall through to the menu, as the message promises
            print('parser %s not found, please select' % testName)
    if row is None:
        row = _askForTest(sampleText)
        if row is None:
            return
    (name, loadFunc, parseFunc, statFunc) = row

    # force pause to 1 or 0 by asking
    if pause == 'unknown':
        inp = raw_input("Shall we do memory tests? i.e. you look at Task Manager? y/n > ")
        # "inp in 'yn'" also accepted '' and 'yn', and an assert
        # disappears under -O, so validate explicitly
        if inp not in ('y', 'n'):
            raise ValueError('enter "y" or "n". Please run again!')
        pause = (inp == 'y')

    # report the row actually chosen; testName is None in menu mode
    print('testing %s' % name)

    # load the parser
    t0 = time.clock()
    parser = loadFunc()
    loadTime = time.clock() - t0

    if pause:
        baseMem = float(raw_input("Pre-parsing: please input python process memory in kb > "))
    t1 = time.clock()
    parsedOutput = parseFunc(parser, sampleText)
    parseTime = time.clock() - t1

    if pause:
        totalMem = float(raw_input('Post-parsing: please input python process memory in kb > '))
        usedMem = totalMem - baseMem
        # bytes of memory used per byte of XML parsed
        memFactor = usedMem * 1024.0 / len(sampleText)

    t3 = time.clock()
    n, a = statFunc(parsedOutput)
    traverseTime = time.clock() - t3
    print('counted %d tags, %d attributes' % (n, a))

    if pause:
        print('%s: init %0.4f, parse %0.4f, traverse %0.4f, mem used %dkb, mem factor %0.2f' % (
            name, loadTime, parseTime, traverseTime, usedMem, memFactor))
    else:
        print('%s: init %0.4f, parse %0.4f, traverse %0.4f' % (
            name, loadTime, parseTime, traverseTime))
    print('')
if __name__=='__main__':
    # Command line: [-nodtd] [-pause | -nopause] [parserName]
    import sys
    args = sys.argv[:]

    # -nodtd: benchmark with the DTD declaration removed
    dtd = 1
    if '-nodtd' in args:
        args.remove('-nodtd')
        dtd = 0

    # -pause / -nopause: do or skip the memory prompts; with neither
    # flag interact() will ask
    pause = 'unknown'
    if '-pause' in args:
        args.remove('-pause')
        pause = 1
    elif '-nopause' in args:
        args.remove('-nopause')
        pause = 0

    # whatever is left after the script name is the parser to run
    testName = None
    if len(args) > 1:
        testName = args[1]

    interact(testName, dtd, pause=pause)
|