#!/usr/bin/env python
# Benchmark the effect of chunkshapes when reading large datasets.
# You need at least PyTables 2.1 to run this!
# F. Alted
import numpy
import tables
from time import time
dim1, dim2 = 360, 6109666
rows_to_read = range(0, dim1, 36)  # ten evenly spaced rows
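# The dataset is dim1 x dim2 float64 values, i.e. roughly 16.4 GB on disk
# (no compression is used), so make sure /tmp has enough free space.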
print "="*32
# Create the EArray
f = tables.openFile("/tmp/test.h5", "w")
a = f.createEArray(f.root, "a", tables.Float64Atom(), shape=(dim1, 0),
                   expectedrows=dim2)
print "Chunkshape for original array:", a.chunkshape
# Fill the EArray
t1 = time()
zeros = numpy.zeros((dim1,1), dtype="float64")
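# Append one (dim1, 1) column at a time along the extendable second axis.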
for i in xrange(dim2):
    a.append(zeros)
tcre = round(time()-t1, 3)
thcre = round(dim1*dim2*8 / (tcre * 1024 * 1024), 1)
print "Time to append %d rows: %s sec (%s MB/s)" % (a.nrows, tcre, thcre)
# Read some row vectors from the original array
t1 = time()
for i in rows_to_read:
    r1 = a[i, :]
tr1 = round(time()-t1, 3)
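# The throughput figure only counts the bytes of the requested rows; with
# chunks that span all dim1 rows, the actual I/O per row read is far larger.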
thr1 = round(dim2*len(rows_to_read)*8 / (tr1 * 1024 * 1024), 1)
print "Time to read ten rows in original array: %s sec (%s MB/s)" % (tr1, thr1)
print "="*32
# Copy the array to another with a row-wise chunkshape
t1 = time()
#newchunkshape = (1, a.chunkshape[0]*a.chunkshape[1])
newchunkshape = (1, a.chunkshape[0]*a.chunkshape[1]*10)  # ten times larger chunks than above
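# A (1, N) chunkshape keeps each chunk inside a single row, so a row read
# touches only that row's own chunks instead of chunks shared by all rows.
# Note that the copy below has to rewrite the whole ~16.4 GB dataset.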
b = a.copy(f.root, "b", chunkshape=newchunkshape)
tcpy = round(time()-t1, 3)
thcpy = round(dim1*dim2*8 / (tcpy * 1024 * 1024), 1)
print "Chunkshape for row-wise chunkshape array:", b.chunkshape
print "Time to copy the original array: %s sec (%s MB/s)" % (tcpy, thcpy)
# Read the same ten rows from the copied array
t1 = time()
for i in rows_to_read:
    r2 = b[i, :]
tr2 = round(time()-t1, 3)
thr2 = round(dim2*len(rows_to_read)*8 / (tr2 * 1024 * 1024), 1)
print "Time to read with a row-wise chunkshape: %s sec (%s MB/s)" % (tr2, thr2)
print "="*32
print "Speed-up with a row-wise chunkshape:", round(tr1/tr2, 1)
f.close()