search bench.py : » Database » PyTables » tables-2.1.2 » bench » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Database » PyTables
PyTables » tables 2.1.2 » bench » search-bench.py
#!/usr/bin/env python

import sys

import time
from tables import *
import random
import math
import warnings
import numpy

# Initialize the random generator always with the same integer
# in order to have reproductible results
random.seed(19)
numpy.random.seed(19)

randomvalues = 0
worst=0

Small = {
    "var1" : StringCol(itemsize=4, dflt="Hi!", pos=2),
    "var2" : Int32Col(pos=1),
    "var3" : Float64Col(pos=0),
    #"var4" : BoolCol(),
    }

def createNewBenchFile(bfile, verbose):

    class Create(IsDescription):
        nrows   = Int32Col(pos=0)
        irows   = Int32Col(pos=1)
        tfill   = Float64Col(pos=2)
        tidx    = Float64Col(pos=3)
        tcfill  = Float64Col(pos=4)
        tcidx   = Float64Col(pos=5)
        rowsecf = Float64Col(pos=6)
        rowseci = Float64Col(pos=7)
        fsize   = Float64Col(pos=8)
        isize   = Float64Col(pos=9)
        psyco   = BoolCol(pos=10)

    class Search(IsDescription):
        nrows   = Int32Col(pos=0)
        rowsel  = Int32Col(pos=1)
        time1   = Float64Col(pos=2)
        time2   = Float64Col(pos=3)
        tcpu1   = Float64Col(pos=4)
        tcpu2   = Float64Col(pos=5)
        rowsec1 = Float64Col(pos=6)
        rowsec2 = Float64Col(pos=7)
        psyco   = BoolCol(pos=8)

    if verbose:
        print "Creating a new benchfile:", bfile
    # Open the benchmarking file
    bf = openFile(bfile, "w")
    # Create groups
    for recsize in ["small"]:
        group = bf.createGroup("/", recsize, recsize+" Group")
        # Attach the row size of table as attribute
        if recsize == "small":
            group._v_attrs.rowsize = 16
        # Create a Table for writing bench
        bf.createTable(group, "create_best", Create, "best case")
        bf.createTable(group, "create_worst", Create, "worst case")
        for case in ["best","worst"]:
            # create a group for searching bench (best case)
            groupS = bf.createGroup(group, "search_"+case, "Search Group")
            # Create Tables for searching
            for mode in ["indexed", "inkernel", "standard"]:
                groupM = bf.createGroup(groupS, mode, mode+" Group")
                # for searching bench
                #for atom in ["string", "int", "float", "bool"]:
                for atom in ["string", "int", "float"]:
                    bf.createTable(groupM, atom, Search, atom+" bench")
    bf.close()

def createFile(filename, nrows, filters, index, heavy, noise, verbose):

    # Open a file in "w"rite mode
    fileh = openFile(filename, mode = "w", title="Searchsorted Benchmark",
                     filters=filters)
    rowswritten = 0

    # Create the test table
    table = fileh.createTable(fileh.root, 'table', Small, "test table",
                              None, nrows)

    t1 = time.time()
    cpu1 = time.clock()
    nrowsbuf = table.nrowsinbuf
    minimum = 0
    maximum = nrows
    for i in xrange(0, nrows, nrowsbuf):
        if i+nrowsbuf > nrows:
            j = nrows
        else:
            j = i+nrowsbuf
        if randomvalues:
            var3 = numpy.random.uniform(minimum, maximum, size=j-i)
        else:
            var3 = numpy.arange(i, j, dtype=numpy.float64)
            if noise > 0:
                var3 += numpy.random.uniform(-noise, noise, size=j-i)
        var2 = numpy.array(var3, dtype=numpy.int32)
        var1 = numpy.empty(shape=[j-i], dtype="S4")
        if not heavy:
            var1[:] = var2
        table.append([var3, var2, var1])
    table.flush()
    rowswritten += nrows
    time1 = time.time()-t1
    tcpu1 = time.clock()-cpu1
    print "Time for filling:", round(time1,3),\
          "Krows/s:", round(nrows/1000./time1,3),
    fileh.close()
    size1 = os.stat(filename)[6]
    print ", File size:", round(size1/(1024.*1024.), 3), "MB"
    fileh = openFile(filename, mode = "a", title="Searchsorted Benchmark",
                     filters=filters)
    table = fileh.root.table
    rowsize = table.rowsize
    if index:
        t1 = time.time()
        cpu1 = time.clock()
        # Index all entries
        if not heavy:
            indexrows = table.cols.var1.createIndex(filters=filters)
        for colname in ['var2', 'var3']:
            table.colinstances[colname].createIndex(filters=filters)
        time2 = time.time()-t1
        tcpu2 = time.clock()-cpu1
        print "Time for indexing:", round(time2,3), \
              "iKrows/s:", round(indexrows/1000./time2,3),
    else:
        indexrows = 0
        time2 = 0.0000000001  # an ugly hack
        tcpu2 = 0.

    if verbose:
        if index:
            idx = table.cols.var1.index
            print "Index parameters:", repr(idx)
        else:
            print "NOT indexing rows"
    # Close the file
    fileh.close()

    size2 = os.stat(filename)[6] - size1
    if index:
        print ", Index size:", round(size2/(1024.*1024.), 3), "MB"
    return (rowswritten, indexrows, rowsize, time1, time2,
            tcpu1, tcpu2, size1, size2)

def benchCreate(file, nrows, filters, index, bfile, heavy,
                psyco, noise, verbose):

    # Open the benchfile in append mode
    bf = openFile(bfile,"a")
    recsize = "small"
    if worst:
        table = bf.getNode("/"+recsize+"/create_worst")
    else:
        table = bf.getNode("/"+recsize+"/create_best")

    (rowsw, irows, rowsz, time1, time2, tcpu1, tcpu2, size1, size2) = \
          createFile(file, nrows, filters, index, heavy, noise, verbose)
    # Collect data
    table.row["nrows"] = rowsw
    table.row["irows"] = irows
    table.row["tfill"] = time1
    table.row["tidx"]  = time2
    table.row["tcfill"] = tcpu1
    table.row["tcidx"] = tcpu2
    table.row["fsize"] = size1
    table.row["isize"] = size2
    table.row["psyco"] = psyco
    tapprows = round(time1, 3)
    cpuapprows = round(tcpu1, 3)
    tpercent = int(round(cpuapprows/tapprows, 2)*100)
    print "Rows written:", rowsw, " Row size:", rowsz
    print "Time writing rows: %s s (real) %s s (cpu)  %s%%" % \
          (tapprows, cpuapprows, tpercent)
    rowsecf = rowsw / tapprows
    table.row["rowsecf"] = rowsecf
    #print "Write rows/sec: ", rowsecf
    print "Total file size:", round((size1+size2)/(1024.*1024.), 3), "MB",
    print ", Write KB/s (pure data):", int(rowsw * rowsz / (tapprows * 1024))
    #print "Write KB/s :", int((size1+size2) / ((time1+time2) * 1024))
    tidxrows = time2
    cpuidxrows = round(tcpu2, 3)
    tpercent = int(round(cpuidxrows/tidxrows, 2)*100)
    print "Rows indexed:", irows, " (IMRows):", irows / float(10**6)
    print "Time indexing rows: %s s (real) %s s (cpu)  %s%%" % \
          (round(tidxrows,3), cpuidxrows, tpercent)
    rowseci = irows / tidxrows
    table.row["rowseci"] = rowseci
    table.row.append()
    bf.close()

def readFile(filename, atom, riter, indexmode, dselect, verbose):
    # Open the HDF5 file in read-only mode

    fileh = openFile(filename, mode = "r")
    table = fileh.root.table
    var1 = table.cols.var1
    var2 = table.cols.var2
    var3 = table.cols.var3
    if indexmode == "indexed":
        if var2.index.nelements > 0:
            where = table._whereIndexed
        else:
            warnings.warn("Not indexed table or empty index. Defaulting to in-kernel selection")
            indexmode = "inkernel"
            where = table._whereInRange
    elif indexmode == "inkernel":
        where = table.where
    if verbose:
        print "Max rows in buf:", table.nrowsinbuf
        print "Rows in", table._v_pathname, ":", table.nrows
        print "Buffersize:", table.rowsize * table.nrowsinbuf
        print "MaxTuples:", table.nrowsinbuf
        if indexmode == "indexed":
            print "Chunk size:", var2.index.sorted.chunksize
            print "Number of elements per slice:", var2.index.nelemslice
            print "Slice number in", table._v_pathname, ":", var2.index.nrows

    #table.nrowsinbuf = 10
    #print "nrowsinbuf-->", table.nrowsinbuf
    rowselected = 0
    time2 = 0.
    tcpu2 = 0.
    results = []
    print "Select mode:", indexmode, ". Selecting for type:", atom
    # Initialize the random generator always with the same integer
    # in order to have reproductible results on each read iteration
    random.seed(19)
    numpy.random.seed(19)
    for i in xrange(riter):
        # The interval for look values at. This is aproximately equivalent to
        # the number of elements to select
        rnd = numpy.random.randint(table.nrows)
        cpu1 = time.clock()
        t1 = time.time()
        if atom == "string":
            val = str(rnd)[-4:]
            if indexmode in ["indexed", "inkernel"]:
                results = [p.nrow
                           for p in where('var1 == val')]
            else:
                results = [p.nrow for p in table
                           if p["var1"] == val]
        elif atom == "int":
            val = rnd+dselect
            if indexmode in ["indexed", "inkernel"]:
                results = [p.nrow
                           for p in where('(rnd <= var3) & (var3 < val)')]
            else:
                results = [p.nrow for p in table
                           if rnd <= p["var2"] < val]
        elif atom == "float":
            val = rnd+dselect
            if indexmode in ["indexed", "inkernel"]:
                t1=time.time()
                results = [p.nrow
                           for p in where('(rnd <= var3) & (var3 < val)')]
            else:
                results = [p.nrow for p in table
                           if float(rnd) <= p["var3"] < float(val)]
        else:
            raise ValueError, "Value for atom '%s' not supported." % atom
        rowselected += len(results)
        #print "selected values-->", results
        if i == 0:
            # First iteration
            time1 = time.time() - t1
            tcpu1 = time.clock() - cpu1
        else:
            if indexmode == "indexed":
                # if indexed, wait until the 5th iteration (in order to
                # insure that the index is effectively cached) to take times
                if i >= 5:
                    time2 += time.time() - t1
                    tcpu2 += time.clock() - cpu1
            else:
                time2 += time.time() - t1
                tcpu2 += time.clock() - cpu1

    if riter > 1:
        if indexmode == "indexed" and riter >= 5:
            correction = 5
        else:
            correction = 1
        time2 = time2 / (riter - correction)
        tcpu2 = tcpu2 / (riter - correction)
    if verbose and 1:
        print "Values that fullfill the conditions:"
        print results

    #rowsread = table.nrows * riter
    rowsread = table.nrows
    rowsize = table.rowsize

    # Close the file
    fileh.close()

    return (rowsread, rowselected, rowsize, time1, time2, tcpu1, tcpu2)

def benchSearch(file, riter, indexmode, bfile, heavy, psyco, dselect, verbose):

    # Open the benchfile in append mode
    bf = openFile(bfile,"a")
    recsize = "small"
    if worst:
        tableparent = "/"+recsize+"/search_worst/"+indexmode+"/"
    else:
        tableparent = "/"+recsize+"/search_best/"+indexmode+"/"

    # Do the benchmarks
    if not heavy:
        #atomlist = ["string", "int", "float", "bool"]
        atomlist = ["string", "int", "float"]
    else:
        #atomlist = ["int", "float", "bool"]
        atomlist = ["int", "float"]
    for atom in atomlist:
        tablepath = tableparent + atom
        table = bf.getNode(tablepath)
        (rowsr, rowsel, rowssz, time1, time2, tcpu1, tcpu2) = \
                readFile(file, atom, riter, indexmode, dselect, verbose)
        row = table.row
        row["nrows"] = rowsr
        row["rowsel"] = rowsel
        treadrows = round(time1, 6)
        row["time1"] = time1
        treadrows2 = round(time2, 6)
        row["time2"] = time2
        cpureadrows = round(tcpu1, 6)
        row["tcpu1"] = tcpu1
        cpureadrows2 = round(tcpu2, 6)
        row["tcpu2"] = tcpu2
        row["psyco"] = psyco
        tpercent = int(round(cpureadrows/treadrows, 2)*100)
        if riter > 1:
            tpercent2 = int(round(cpureadrows2/treadrows2, 2)*100)
        else:
            tpercent2 = 0.
        tMrows = rowsr / (1000*1000.)
        sKrows = rowsel / 1000.
        if atom == "string": # just to print once
            print "Rows read:", rowsr, "Mread:", round(tMrows, 6), "Mrows"
        print "Rows selected:", rowsel, "Ksel:", round(sKrows,6), "Krows"
        print "Time selecting (1st time): %s s (real) %s s (cpu)  %s%%" % \
              (treadrows, cpureadrows, tpercent)
        if riter > 1:
            print "Time selecting (cached): %s s (real) %s s (cpu)  %s%%" % \
                  (treadrows2, cpureadrows2, tpercent2)
        #rowsec1 = round(rowsr / float(treadrows), 6)/10**6
        rowsec1 = rowsr / treadrows
        row["rowsec1"] = rowsec1
        print "Read Mrows/sec: ",
        print round(rowsec1 / 10.**6, 6), "(first time)",
        if riter > 1:
            rowsec2 = rowsr / treadrows2
            row["rowsec2"] = rowsec2
            print round(rowsec2 / 10.**6, 6), "(cache time)"
        else:
            print
        # Append the info to the table
        row.append()
        table.flush()
    # Close the benchmark file
    bf.close()

if __name__=="__main__":
    import sys
    import os.path
    import getopt
    try:
        import psyco
        psyco_imported = 1
    except:
        psyco_imported = 0

    import time

    usage = """usage: %s [-v] [-p] [-R] [-r] [-w] [-c level] [-l complib] [-S] [-F] [-n nrows] [-x] [-b file] [-t] [-h] [-k riter] [-m indexmode] [-N range] [-d range] datafile
            -v verbose
            -p use "psyco" if available
            -R use Random values for filling
            -r only read test
            -w only write test
            -c sets a compression level (do not set it or 0 for no compression)
            -l sets the compression library ("zlib", "lzo", "ucl", "bzip2" or "none")
            -S activate shuffling filter
            -F activate fletcher32 filter
            -n set the number of rows in tables (in krows)
            -x don't make indexes
            -b bench filename
            -t worsT searching case
            -h heavy benchmark (operations without strings)
            -m index mode for reading ("indexed" | "inkernel" | "standard")
            -N introduce (uniform) noise within range into the values
            -d the interval for look values (int, float) at. Default is 3.
            -k number of iterations for reading\n""" % sys.argv[0]

    try:
        opts, pargs = getopt.getopt(sys.argv[1:], 'vpSFRrowxthk:b:c:l:n:m:N:d:')
    except:
        sys.stderr.write(usage)
        sys.exit(0)

    # if we pass too much parameters, abort
    if len(pargs) <> 1:
        sys.stderr.write(usage)
        sys.exit(0)

    # default options
    dselect = 3.
    noise = 0.
    verbose = 0
    fieldName = None
    testread = 1
    testwrite = 1
    usepsyco = 0
    complevel = 0
    shuffle = 0
    fletcher32 = 0
    complib = "zlib"
    nrows = 1000
    index = 1
    heavy = 0
    bfile = "bench.h5"
    supported_imodes = ["indexed", "inkernel", "standard"]
    indexmode = "inkernel"
    riter = 1

    # Get the options
    for option in opts:
        if option[0] == '-v':
            verbose = 1
        if option[0] == '-p':
            usepsyco = 1
        if option[0] == '-R':
            randomvalues = 1
        if option[0] == '-S':
            shuffle = 1
        if option[0] == '-F':
            fletcher32 = 1
        elif option[0] == '-r':
            testwrite = 0
        elif option[0] == '-w':
            testread = 0
        elif option[0] == '-x':
            index = 0
        elif option[0] == '-h':
            heavy = 1
        elif option[0] == '-t':
            worst = 1
        elif option[0] == '-b':
            bfile = option[1]
        elif option[0] == '-c':
            complevel = int(option[1])
        elif option[0] == '-l':
            complib = option[1]
        elif option[0] == '-m':
            indexmode = option[1]
            if indexmode not in supported_imodes:
                raise ValueError, "Indexmode should be any of '%s' and you passed '%s'" % (supported_imodes, indexmode)
        elif option[0] == '-n':
            nrows = int(float(option[1])*1000)
        elif option[0] == '-N':
            noise = float(option[1])
        elif option[0] == '-d':
            dselect = float(option[1])
        elif option[0] == '-k':
            riter = int(option[1])

    if worst:
        nrows -= 1  # the worst case

    if complib == "none":
        # This means no compression at all
        complib="zlib"  # just to make PyTables not complaining
        complevel=0

    # Catch the hdf5 file passed as the last argument
    file = pargs[0]

    # Build the Filters instance
    filters = Filters(complevel=complevel, complib=complib,
                      shuffle=shuffle, fletcher32=fletcher32)

    # Create the benchfile (if needed)
    if not os.path.exists(bfile):
        createNewBenchFile(bfile, verbose)

    if testwrite:
        if verbose:
            print "Compression level:", complevel
            if complevel > 0:
                print "Compression library:", complib
                if shuffle:
                    print "Suffling..."
        if psyco_imported and usepsyco:
            psyco.bind(createFile)
        benchCreate(file, nrows, filters, index, bfile, heavy,
                    usepsyco, noise, verbose)
    if testread:
        if psyco_imported and usepsyco:
            psyco.bind(readFile)
        benchSearch(file, riter, indexmode, bfile, heavy, usepsyco,
                    dselect, verbose)
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.