import numpy as np
import tables
from time import time
# Total number of rows written to the benchmark table.
N = 10 ** 6
# Upper bound on the number of collections (200 collections maximum);
# fill_bucket() derives the mean and spread of the collection ids from it.
NCOLL = 200
# Seed NumPy's global random generator so that every run produces the
# same data, and therefore reproducible results.
np.random.seed(19)
class Energies(tables.IsDescription):
    """Row description of the "Energies" table created by this script.

    Each row holds the id of the collection it belongs to and one energy
    value.
    """
    # NOTE(review): buckets are appended as a (collection, energy) pair of
    # arrays; this presumably relies on the table storing the columns in
    # that same order -- confirm against the PyTables documentation.
    # Collection id of the row (unsigned 8-bit integer column).
    collection = tables.UInt8Col()
    # Energy value of the row (64-bit floating point column).
    energy = tables.Float64Col()
def fill_bucket(lbucket):
    """Generate the column data for one bucket of `lbucket` rows.

    Returns a ``(collection, energy)`` pair of arrays, each of length
    `lbucket`.  The collection values are drawn from a normal distribution
    with mean ``NCOLL/2`` and standard deviation ``NCOLL/100``; the energies
    are simply ``0.0, 1.0, ..., lbucket - 1``.
    """
    # A wider spread of NCOLL/10 is the alternative setting for the
    # standard deviation; NCOLL/100 is the one in use.
    mean, spread = NCOLL/2, NCOLL/100
    energies = np.arange(lbucket, dtype='f8')
    collection_ids = np.random.normal(mean, spread, lbucket)
    return collection_ids, energies
# Create the HDF5 file and fill the "Energies" table with exactly N rows.
t1 = time()
f = tables.openFile("data.nobackup/collations.h5", "w")
try:
    table = f.createTable("/", "Energies", Energies, expectedrows=N)
    # Rows are appended in buckets of 1000, for speed.
    lbucket = 1000
    # Split N into full buckets plus a remainder.  Iterating over
    # xrange(0, N, lbucket) and always appending a full bucket would write
    # more than N rows whenever N is not a multiple of lbucket.
    nfull, nrest = divmod(N, lbucket)
    for _ in xrange(nfull):
        table.append(fill_bucket(lbucket))
    # Fill the remaining rows, if any.
    if nrest:
        table.append(fill_bucket(nrest))
finally:
    # Always release the file handle, even if filling the table failed.
    f.close()
print("Time to create the table with %d entries: %.3f" % (N, time()-t1))
# Re-open the file (in append mode) and fetch the table to be grouped
# by collection.
f = tables.openFile("data.nobackup/collations.h5", "a")
table = f.root.Energies

#########################################################
# First solution: load the table completely in memory
#########################################################
t1 = time()
t = table[:]  # the whole table as a single structured array
coll_ids = t['collection']
energies = t['energy']
coll1 = []
collections = np.unique(coll_ids)
for c in collections:
    # A boolean mask picks out the energies belonging to collection `c`.
    energy_this_collection = energies[coll_ids == c]
    sener = energy_this_collection.sum()
    coll1.append(sener)
    print("%s %s %s" % (c, ' : ', sener))
del collections, energy_this_collection
print("Time for first solution: %.3f" % (time()-t1))
#########################################################
# Second solution: load all the collections in memory
#########################################################
t1 = time()
collections = {}
for row in table:
    # Keep one Python list of energies per collection id.
    collections.setdefault(row['collection'], []).append(row['energy'])
# Turn each list into a NumPy array and sum it, visiting the collections
# in ascending order of their id.
coll2 = []
for c in sorted(collections):
    energy_this_collection = np.array(collections[c])
    sener = energy_this_collection.sum()
    coll2.append(sener)
    print("%s %s %s" % (c, ' : ', sener))
del collections, energy_this_collection
print("Time for second solution: %.3f" % (time()-t1))
if tables.is_pro:
    t1 = time()
    # Index the collection column before querying it.
    # (Alternative left disabled: table.cols.collection.reIndex())
    table.cols.collection.createCSIndex()
    print("Time for indexing: %.3f" % (time()-t1))

#########################################################
# Third solution: load each collection separately
#########################################################
t1 = time()
coll3 = []
collection_ids = np.unique(table.col('collection'))
# The loop variable has to be called `c`: the query string below refers
# to it by that name.
for c in collection_ids:
    energy_this_collection = table.readWhere('collection == c', field='energy')
    sener = energy_this_collection.sum()
    coll3.append(sener)
    print("%s %s %s" % (c, ' : ', sener))
del energy_this_collection
print("Time for third solution: %.3f" % (time()-t1))
# By default the fourth solution queries the original table; when
# tables.is_pro is set it queries a copy sorted by collection instead.
table2 = table
if tables.is_pro:
    t1 = time()
    table2 = table.copy('/', 'EnergySortedByCollation',
                        sortby="collection", propindexes=True,
                        overwrite=True)
    print("Time for sorting: %.3f" % (time()-t1))

#####################################################################
# Fourth solution: load each collection separately.  Sorted table.
#####################################################################
t1 = time()
coll4 = []
collection_ids = np.unique(table2.col('collection'))
# The loop variable has to be called `c`: the query string below refers
# to it by that name.
for c in collection_ids:
    energy_this_collection = table2.readWhere('collection == c',
                                              field='energy')
    sener = energy_this_collection.sum()
    coll4.append(sener)
    print("%s %s %s" % (c, ' : ', sener))
del energy_this_collection
print("Time for fourth solution: %.3f" % (time()-t1))
# Finally, check that all solutions do match
try:
    # Raise explicitly instead of using `assert`, so that the check is not
    # stripped when the script runs under `python -O`.
    if not (coll1 == coll2 == coll3 == coll4):
        raise AssertionError("the four solutions do not give the same sums")
finally:
    # Close the file even when the check above fails.
    f.close()