# License: BSD
# Created: February 10, 2004
# Author: Francesc Alted - faltet@pytables.com
# $Id: ptrepack.py 3776 2008-10-02 17:50:07Z faltet $
"""This utility lets you repack your data files in a flexible way.

Pass the flag -h to this script for help on usage.
"""
import sys
import os.path
import time
import getopt
import warnings
from tables.file import openFile
from tables.group import Group
from tables.leaf import Filters
from tables.exceptions import \
OldIndexWarning, NoIndexingWarning, NoSuchNodeError, FlavorWarning
# Global variables
# Set to True by the -v flag in main(); when True, recreateIndexes() and
# main() print progress and statistics messages.
verbose = False
# Set to False by the --dont-regenerate-old-indexes flag in main();
# consulted by recreateIndexes() when a table reports old indexes.
regoldindexes = True
def newdstGroup(dstfileh, dstgroup, title, filters):
    """Return the group at path `dstgroup`, creating any missing component.

    The path is walked one component at a time starting at
    ``dstfileh.root``.  Components that already exist are reused as they
    are; each missing one is created with the given `title` and `filters`.

    dstfileh -- open destination file handle (needs ``root``, ``getNode``
        and ``createGroup``).
    dstgroup -- slash-separated path of the wanted group.
    title, filters -- passed on to ``createGroup`` for every new group.

    Returns the node found or created for the last path component, or the
    root group when the path has no non-empty component.
    """
    group = dstfileh.root
    # Splitting on '/' yields empty names for a leading slash, a trailing
    # slash and the bare root path; skipping them makes this work even if
    # dstgroup == '/'.
    for nodeName in dstgroup.split('/'):
        if nodeName == '':
            continue
        # First try if possible intermediate groups do already exist.
        try:
            group2 = dstfileh.getNode(group, nodeName)
        except NoSuchNodeError:
            # The group does not exist. Create it.
            group2 = dstfileh.createGroup(group, nodeName,
                                          title=title,
                                          filters=filters)
        group = group2
    return group
def recreateIndexes(table, dstfileh, dsttable):
    """Rebuild in `dsttable` the indexes listed as old in `table`.

    table -- source table; its ``_listoldindexes`` attribute names the
        columns to index.
    dstfileh -- destination file handle (only its ``filename`` is used,
        for messages).
    dsttable -- destination table whose columns get the new indexes.

    Nothing is done when the list is empty, or when the module-level
    `regoldindexes` flag is false.  Messages are printed only when the
    module-level `verbose` flag is true.
    """
    listoldindexes = table._listoldindexes
    if listoldindexes == []:
        return
    if not regoldindexes:
        if verbose:
            print("[I]Not regenerating indexes for table: '%s:%s'" %
                  (dstfileh.filename, dsttable._v_pathname))
        # Regeneration was disabled by the user: stop before indexing.
        return
    # Now, recreate the indexed columns
    if verbose:
        print("[I]Regenerating indexes for table: '%s:%s'" %
              (dstfileh.filename, dsttable._v_pathname))
    for colname in listoldindexes:
        if verbose:
            print("[I]Indexing column: '%s'. Please wait..." % colname)
        colobj = dsttable.cols._f_col(colname)
        # We don't specify the filters for the indexes
        colobj.createIndex(filters = None)
def copyLeaf(srcfile, dstfile, srcnode, dstnode, title,
             filters, copyuserattrs, overwritefile, overwrtnodes, stats,
             start, stop, step, chunkshape, sortby, forceCSI,
             propindexes, upgradeflavors):
    """Copy the leaf `srcnode` of `srcfile` to `dstnode` in `dstfile`.

    `dstnode` is split at its last slash into a destination group and a
    leaf name; a trailing slash (empty leaf name) means "keep the source
    leaf's name".  The destination file is appended to when it exists and
    `overwritefile` is false, and is (re)created otherwise.  A missing
    destination group is created; an existing non-group node in its place
    is replaced only when `overwrtnodes` is true.

    The remaining arguments are handed to the leaf's ``copy`` method.
    With `forceCSI` the source file is opened in append mode instead of
    read-only.  With `upgradeflavors`, the FLAVOR attribute of the copy is
    deleted when the source file's format version starts with "1".

    Raises RuntimeError when the destination group path is taken by a
    non-group node and `overwrtnodes` is false, or when the copy itself
    fails.  Both files are closed on every exit path.
    """
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    duplicated_hint = ("Please check that the node names are not duplicated "
                       "in destination, and if so, add the --overwrite-nodes "
                       "flag if desired.")
    # Open the source file
    if forceCSI:
        srcfileh = openFile(srcfile, 'a')
    else:
        srcfileh = openFile(srcfile, 'r')
    dstfileh = None
    try:
        # Get the source node (that should exist)
        srcNode = srcfileh.getNode(srcnode)

        # Get the destination node and its parent
        last_slash = dstnode.rindex('/')
        if last_slash == len(dstnode)-1:
            # A trailing slash: the whole path names the destination group.
            dstgroup = dstnode[:-1]
        elif last_slash > 0:
            dstgroup = dstnode[:last_slash]
        else:
            dstgroup = "/"
        dstleaf = dstnode[last_slash+1:]
        if dstleaf == "":
            dstleaf = srcNode.name

        # Check whether the destination group exists or not
        if os.path.isfile(dstfile) and not overwritefile:
            dstfileh = openFile(dstfile, 'a')
            try:
                dstGroup = dstfileh.getNode(dstgroup)
            except Exception:
                # Deliberately broad: dstgroup is '' when dstnode is '/',
                # and any failed lookup is treated as "group missing".
                # The dstgroup does not seem to exist. Try creating it.
                dstGroup = newdstGroup(dstfileh, dstgroup, title, filters)
            else:
                # The node exists, but it is really a group?
                if not isinstance(dstGroup, Group):
                    # No. Should we overwrite it?
                    if not overwrtnodes:
                        raise RuntimeError(duplicated_hint)
                    parent = dstGroup._v_parent
                    last_slash = dstGroup._v_pathname.rindex('/')
                    dstgroupname = dstGroup._v_pathname[last_slash+1:]
                    # The old node must go before its name can be reused.
                    dstGroup._f_remove()
                    dstGroup = dstfileh.createGroup(parent, dstgroupname,
                                                    title=title,
                                                    filters=filters)
        else:
            # The destination file does not exist or will be overwritten.
            dstfileh = openFile(dstfile, 'w', title=title, filters=filters)
            dstGroup = newdstGroup(dstfileh, dstgroup, title="",
                                   filters=filters)

        # Finally, copy srcNode to dstNode
        try:
            dstNode = srcNode.copy(
                dstGroup, dstleaf, filters = filters,
                copyuserattrs = copyuserattrs, overwrite = overwrtnodes,
                stats = stats, start = start, stop = stop, step = step,
                chunkshape = chunkshape,
                sortby = sortby, forceCSI = forceCSI,
                propindexes = propindexes)
        except Exception:
            exc_type, exc_value = sys.exc_info()[:2]
            print("Problems doing the copy from '%s:%s' to '%s:%s'" %
                  (srcfile, srcnode, dstfile, dstnode))
            print("The error was --> %s: %s" % (exc_type, exc_value))
            print("The destination file looks like:\n%s" % (dstfileh,))
            raise RuntimeError(duplicated_hint)

        # Upgrade flavors in dstNode, if required
        if upgradeflavors and srcfileh.format_version.startswith("1"):
            # Remove original flavor in case the source file has 1.x format
            dstNode.delAttr('FLAVOR')

        # Recreate possible old indexes in destination node
        if srcNode._c_classId == "TABLE":
            recreateIndexes(srcNode, dstfileh, dstNode)
    finally:
        # Close all the open files:
        srcfileh.close()
        if dstfileh is not None:
            dstfileh.close()
def copyChildren(srcfile, dstfile, srcgroup, dstgroup, title,
                 recursive, filters, copyuserattrs, overwritefile,
                 overwrtnodes, stats, start, stop, step,
                 chunkshape, sortby, forceCSI, propindexes,
                 upgradeflavors):
    """Copy the children from source group to destination group.

    The source file is opened with `srcgroup` as its rootUEP, so the
    children copied are those of that group.  The destination file is
    appended to when it exists and `overwritefile` is false, and is
    (re)created otherwise.  A missing destination group is created; an
    existing non-group node in its place is replaced only when
    `overwrtnodes` is true.

    The user attributes of the source group are copied only when the
    destination group was created here and `copyuserattrs` is true.  The
    remaining arguments are handed to the group's ``_f_copyChildren``
    method.  With `forceCSI` the source file is opened in append mode
    instead of read-only.  With `upgradeflavors`, the FLAVOR attribute of
    every copied leaf is deleted when the source file's format version
    starts with "1".

    Raises RuntimeError when the destination group path is taken by a
    non-group node and `overwrtnodes` is false, or when the copy itself
    fails.  Both files are closed on every exit path.
    """
    # Single-argument print(...) is valid on both Python 2 and Python 3.
    duplicated_hint = ("Please check that the node names are not duplicated "
                       "in destination, and if so, add the --overwrite-nodes "
                       "flag if desired.")
    # Open the source file with srcgroup as rootUEP
    if forceCSI:
        srcfileh = openFile(srcfile, 'a', rootUEP=srcgroup)
    else:
        srcfileh = openFile(srcfile, 'r', rootUEP=srcgroup)
    dstfileh = None
    try:
        # Assign the root to srcGroup
        srcGroup = srcfileh.root

        created_dstGroup = False
        # Check whether the destination group exists or not
        if os.path.isfile(dstfile) and not overwritefile:
            dstfileh = openFile(dstfile, 'a')
            try:
                dstGroup = dstfileh.getNode(dstgroup)
            except Exception:
                # Any failed lookup is treated as "group missing".
                # The dstgroup does not seem to exist. Try creating it.
                dstGroup = newdstGroup(dstfileh, dstgroup, title, filters)
                created_dstGroup = True
            else:
                # The node exists, but it is really a group?
                if not isinstance(dstGroup, Group):
                    # No. Should we overwrite it?
                    if not overwrtnodes:
                        raise RuntimeError(duplicated_hint)
                    parent = dstGroup._v_parent
                    last_slash = dstGroup._v_pathname.rindex('/')
                    dstgroupname = dstGroup._v_pathname[last_slash+1:]
                    # The old node must go before its name can be reused.
                    dstGroup._f_remove()
                    dstGroup = dstfileh.createGroup(parent, dstgroupname,
                                                    title=title,
                                                    filters=filters)
        else:
            # The destination file does not exist or will be overwritten.
            dstfileh = openFile(dstfile, 'w', title=title, filters=filters)
            dstGroup = newdstGroup(dstfileh, dstgroup, title="",
                                   filters=filters)
            created_dstGroup = True

        # Copy the attributes to dstGroup, if needed
        if created_dstGroup and copyuserattrs:
            srcGroup._v_attrs._f_copy(dstGroup)

        # Finally, copy srcGroup children to dstGroup
        try:
            srcGroup._f_copyChildren(
                dstGroup, recursive = recursive, filters = filters,
                copyuserattrs = copyuserattrs, overwrite = overwrtnodes,
                stats = stats, start = start, stop = stop, step = step,
                chunkshape = chunkshape,
                sortby = sortby, forceCSI = forceCSI,
                propindexes = propindexes)
        except Exception:
            exc_type, exc_value = sys.exc_info()[:2]
            print("Problems doing the copy from '%s:%s' to '%s:%s'" %
                  (srcfile, srcgroup, dstfile, dstgroup))
            print("The error was --> %s: %s" % (exc_type, exc_value))
            print("The destination file looks like:\n%s" % (dstfileh,))
            raise RuntimeError(duplicated_hint +
                               " In particular, pay attention that rootUEP "
                               "is not fooling you.")

        # Upgrade flavors in dstNode, if required
        if upgradeflavors and srcfileh.format_version.startswith("1"):
            for dstNode in dstGroup._f_walkNodes("Leaf"):
                # Remove original flavor in case the source file has 1.x
                # format
                dstNode.delAttr('FLAVOR')

        # Convert the remaining tables with old indexes (if any)
        for table in srcGroup._f_walkNodes("Table"):
            dsttable = dstfileh.getNode(dstGroup, table._v_pathname)
            recreateIndexes(table, dstfileh, dsttable)
    finally:
        # Close all the open files:
        srcfileh.close()
        if dstfileh is not None:
            dstfileh.close()
def main():
    """Command-line entry point: parse ``sys.argv`` and run the copy.

    Expects two positional arguments, ``sourcefile[:sourcenode]`` and
    ``destfile[:destnode]``; a missing or empty node part means "/".
    When the source node is a Group its children are copied with
    copyChildren(), otherwise the node is copied with copyLeaf().

    Sets the module-level `verbose` and `regoldindexes` flags from the
    -v and --dont-regenerate-old-indexes options.

    Exits with status 0 after printing the usage for -h, and with status 1
    (after printing the usage to stderr) on a command-line error.
    """
    global verbose
    global regoldindexes

    usage = """usage: %s [-h] [-v] [-o] [-R start,stop,step] [--non-recursive] [--dest-title=title] [--dont-copy-userattrs] [--overwrite-nodes] [--complevel=(0-9)] [--complib=lib] [--shuffle=(0|1)] [--fletcher32=(0|1)] [--keep-source-filters] [--chunkshape=value] [--upgrade-flavors] [--dont-regenerate-old-indexes] [--sortby=column] [--forceCSI] [--propindexes] sourcefile:sourcegroup destfile:destgroup
-h -- Print usage message.
-v -- Show more information.
-o -- Overwite destination file.
-R RANGE -- Select a RANGE of rows (in the form "start,stop,step")
during the copy of *all* the leaves. Default values are
"None,None,1", which means a copy of all the rows.
--non-recursive -- Do not do a recursive copy. Default is to do it.
--dest-title=title -- Title for the new file (if not specified,
the source is copied).
--dont-copy-userattrs -- Do not copy the user attrs (default is to do it)
--overwrite-nodes -- Overwrite destination nodes if they exist. Default is
to not overwrite them.
--complevel=(0-9) -- Set a compression level (0 for no compression, which
is the default).
--complib=lib -- Set the compression library to be used during the copy.
lib can be set to "zlib", "lzo" or "bzip2". Defaults to "zlib".
--shuffle=(0|1) -- Activate or not the shuffling filter (default is active
if complevel>0).
--fletcher32=(0|1) -- Whether to activate or not the fletcher32 filter
(not active by default).
--keep-source-filters -- Use the original filters in source files. The
default is not doing that if any of --complevel, --complib, --shuffle
or --fletcher32 option is specified.
--chunkshape=("keep"|"auto"|int|tuple) -- Set a chunkshape. A value
of "auto" computes a sensible value for the chunkshape of the
leaves copied. The default is to "keep" the original value.
--upgrade-flavors -- When repacking PyTables 1.x files, the flavor of
leaves will be unset. With this, such a leaves will be serialized
as objects with the internal flavor ('numpy' for 2.x series).
--dont-regenerate-old-indexes -- Disable regenerating old indexes. The
default is to regenerate old indexes as they are found.
--sortby=column -- Do a table copy sorted by the values of "column".
This requires an existing index in "column". For reversing the order,
use a negative value in the "step" part of "RANGE" (see "-R" flag).
Only applies to table objects.
--forceCSI -- Force the creation of a CSI index in case one is not
available for the --sortby column (this implies the modification of
the *source* file). The default is to not create it.
--propindexes -- Propagate the indexes existing in original tables. The
default is to not propagate them. Only applies to table objects.
\n""" % os.path.basename(sys.argv[0])

    # One long option per flag handled in the loop below; a trailing '='
    # marks the options that take a value.
    try:
        opts, pargs = getopt.getopt(sys.argv[1:], 'hvoR:',
                                    ['non-recursive',
                                     'dest-title=',
                                     'dont-copy-userattrs',
                                     'overwrite-nodes',
                                     'complevel=',
                                     'complib=',
                                     'shuffle=',
                                     'fletcher32=',
                                     'keep-source-filters',
                                     'chunkshape=',
                                     'upgrade-flavors',
                                     'dont-regenerate-old-indexes',
                                     'sortby=',
                                     'forceCSI',
                                     'propindexes',
                                     ])
    except getopt.GetoptError:
        value = sys.exc_info()[1]
        print("Error parsing the options. The error was: %s" % (value,))
        sys.stderr.write(usage)
        sys.exit(1)

    # default options
    overwritefile = False
    keepfilters = False
    chunkshape = "keep"
    complevel = None
    complib = None
    shuffle = None
    fletcher32 = None
    title = ""
    copyuserattrs = True
    rng = None
    recursive = True
    overwrtnodes = False
    upgradeflavors = False
    sortby = None
    forceCSI = False
    propindexes = False

    # Get the options
    for option in opts:
        if option[0] == '-h':
            sys.stderr.write(usage)
            sys.exit(0)
        elif option[0] == '-v':
            verbose = True
        elif option[0] == '-o':
            overwritefile = True
        elif option[0] == '-R':
            # NOTE(review): eval() runs arbitrary text from the command
            # line; acceptable only because the caller owns the process.
            try:
                rng = eval("slice("+option[1]+")")
            except Exception:
                print("Error when getting the range parameter.")
                value = sys.exc_info()[1]
                print(" The error was: %s" % (value,))
                sys.stderr.write(usage)
                sys.exit(1)
        elif option[0] == '--dest-title':
            title = option[1]
        elif option[0] == '--dont-copy-userattrs':
            copyuserattrs = False
        elif option[0] == '--non-recursive':
            recursive = False
        elif option[0] == '--overwrite-nodes':
            overwrtnodes = True
        elif option[0] == '--keep-source-filters':
            keepfilters = True
        elif option[0] == '--chunkshape':
            chunkshape = option[1]
            if chunkshape.isdigit() or chunkshape.startswith('('):
                # NOTE(review): eval() on command-line text, see -R above.
                chunkshape = eval(chunkshape)
        elif option[0] == '--upgrade-flavors':
            upgradeflavors = True
        elif option[0] == '--dont-regenerate-old-indexes':
            regoldindexes = False
        elif option[0] == '--complevel':
            complevel = int(option[1])
        elif option[0] == '--complib':
            complib = option[1]
        elif option[0] == '--shuffle':
            shuffle = int(option[1])
        elif option[0] == '--fletcher32':
            fletcher32 = int(option[1])
        elif option[0] == '--sortby':
            sortby = option[1]
        elif option[0] == '--propindexes':
            propindexes = True
        elif option[0] == '--forceCSI':
            forceCSI = True
        else:
            print("%s : Unrecognized option" % (option[0],))
            sys.stderr.write(usage)
            sys.exit(1)

    # if we pass a number of files different from 2, abort
    if len(pargs) != 2:
        print("You need to pass both source and destination!.")
        sys.stderr.write(usage)
        sys.exit(1)

    # Catch the files passed as the last arguments
    src = pargs[0].split(':')
    dst = pargs[1].split(':')
    if len(src) == 1:
        srcfile, srcnode = src[0], "/"
    else:
        srcfile, srcnode = src
    if len(dst) == 1:
        dstfile, dstnode = dst[0], "/"
    else:
        dstfile, dstnode = dst

    if srcnode == "":
        # case where filename == "filename:" instead of "filename:/"
        srcnode = "/"

    if dstnode == "":
        # case where filename == "filename:" instead of "filename:/"
        dstnode = "/"

    # Ignore the warnings for tables that contains oldindexes
    # (these will be handled by the copying routines)
    warnings.filterwarnings("ignore", category=OldIndexWarning)

    # Let the user be warned in case he is using ptrepack when copying
    # files with indexes
    #warnings.filterwarnings("ignore", category=NoIndexingWarning)

    # Ignore the flavors warnings during upgrading flavor operations
    if upgradeflavors:
        warnings.filterwarnings("ignore", category=FlavorWarning)

    # Build the Filters instance
    if ((complevel, complib, shuffle, fletcher32) == (None,)*4
        or keepfilters):
        filters = None
    else:
        if complevel is None: complevel = 0
        if shuffle is None:
            if complevel > 0:
                shuffle = True
            else:
                shuffle = False
        if complib is None: complib = "zlib"
        if fletcher32 is None: fletcher32 = False
        filters = Filters(complevel=complevel, complib=complib,
                          shuffle=shuffle, fletcher32=fletcher32)

    # The start, stop and step params:
    start, stop, step = None, None, 1  # Defaults
    if rng:
        start, stop, step = rng.start, rng.stop, rng.step

    # Some timing
    t1 = time.time()
    cpu1 = time.clock()
    # Copy the file
    if verbose:
        print("+=+"*20)
        print("Recursive copy: %s" % (recursive,))
        print("Applying filters: %s" % (filters,))
        if sortby is not None:
            print("Sorting table(s) by column: %s" % (sortby,))
            print("Forcing a CSI creation: %s" % (forceCSI,))
        if propindexes:
            print("Recreating indexes in copied table(s)")
        print("Start copying %s:%s to %s:%s" % (srcfile, srcnode,
                                                dstfile, dstnode))
        print("+=+"*20)

    # Check whether the specified source node is a group or a leaf.  The
    # answer is taken while the file is still open, and the file is closed
    # again even if the node lookup fails.
    h5srcfile = openFile(srcfile, 'r')
    try:
        srcisgroup = isinstance(h5srcfile.getNode(srcnode), Group)
    finally:
        h5srcfile.close()

    stats = {'groups': 0, 'leaves': 0, 'bytes': 0}
    if srcisgroup:
        copyChildren(
            srcfile, dstfile, srcnode, dstnode,
            title = title, recursive = recursive, filters = filters,
            copyuserattrs = copyuserattrs, overwritefile = overwritefile,
            overwrtnodes = overwrtnodes, stats = stats,
            start = start, stop = stop, step = step,
            chunkshape = chunkshape,
            sortby = sortby, forceCSI = forceCSI, propindexes = propindexes,
            upgradeflavors=upgradeflavors)
    else:
        # If not a Group, it should be a Leaf
        copyLeaf(
            srcfile, dstfile, srcnode, dstnode,
            title = title, filters = filters, copyuserattrs = copyuserattrs,
            overwritefile = overwritefile, overwrtnodes = overwrtnodes,
            stats = stats, start = start, stop = stop, step = step,
            chunkshape = chunkshape,
            sortby = sortby, forceCSI = forceCSI, propindexes = propindexes,
            upgradeflavors=upgradeflavors)

    # Gather some statistics
    t2 = time.time()
    cpu2 = time.clock()
    tcopy = round(t2-t1, 3)
    cpucopy = round(cpu2-cpu1, 3)
    # A very fast copy rounds to 0.0 seconds; avoid dividing by it.
    if tcopy > 0:
        tpercent = int(round(cpucopy/tcopy, 2)*100)
    else:
        tpercent = 0

    if verbose:
        ngroups = stats['groups']
        nleafs = stats['leaves']
        nbytescopied = stats['bytes']

        print("Groups copied: %s  Leaves copied: %s" % (ngroups, nleafs))
        if copyuserattrs:
            print("User attrs copied")
        else:
            print("User attrs not copied")
        print("KBytes copied: %s" % (round(nbytescopied/1024.,3),))
        print("Time copying: %s s (real) %s s (cpu)  %s%%" %
              (tcopy, cpucopy, tpercent))
        # Rates are meaningless (and undefined) for a zero elapsed time.
        if tcopy > 0:
            print("Copied nodes/sec:  %s" %
                  (round((ngroups+nleafs) / float(tcopy),1),))
            print("Copied KB/s : %s" %
                  (int(nbytescopied / (tcopy * 1024)),))