# License: BSD
# Created: June 15, 2005
# Author: Antonio Valentino
# Modified by: Francesc Alted
# $Id: carray.py 3933 2008-12-09 16:06:28Z faltet $
"""Here is defined the CArray class.

See CArray class docstring for more info.

Misc variables:

    __version__

"""
import sys, warnings
import numpy
from tables.utilsExtension import lrange
from tables.atom import Atom,EnumAtom,split_type
from tables.leaf import Leaf
from tables.array import Array
from tables.utils import correct_byteorder,SizeType
__version__ = "$Revision: 3933 $"
# default version for CARRAY objects
obversion = "1.0" # Support for time & enumerated datatypes.
class CArray(Array):
    """
    This class represents homogeneous datasets in an HDF5 file.

    The difference between a `CArray` and a normal `Array`, from which
    it inherits, is that a `CArray` has a chunked layout and, as a
    consequence, it supports compression.  You can use datasets of
    this class to easily save or load arrays to or from disk, with
    compression support included.

    Example of use
    --------------

    See below a small example of the use of the `CArray` class.  The
    code is available in ``examples/carray1.py``::

        import numpy
        import tables

        fileName = 'carray1.h5'
        shape = (200, 300)
        atom = tables.UInt8Atom()
        filters = tables.Filters(complevel=5, complib='zlib')

        h5f = tables.openFile(fileName, 'w')
        ca = h5f.createCArray(h5f.root, 'carray', atom, shape, filters=filters)
        # Fill a hyperslab in ``ca``.
        ca[10:60, 20:70] = numpy.ones((50, 50))
        h5f.close()

        # Re-open and read another hyperslab
        h5f = tables.openFile(fileName)
        print h5f
        print h5f.root.carray[8:12, 18:22]
        h5f.close()

    The output for the previous script is something like::

        carray1.h5 (File) ''
        Last modif.: 'Thu Apr 12 10:15:38 2007'
        Object Tree:
        / (RootGroup) ''
        /carray (CArray(200, 300), shuffle, zlib(5)) ''

        [[0 0 0 0]
         [0 0 0 0]
         [0 0 1 1]
         [0 0 1 1]]
    """

    # Class identifier.
    _c_classId = 'CARRAY'

    # Properties
    # ~~~~~~~~~~

    # Special methods
    # ~~~~~~~~~~~~~~~
    def __init__(self, parentNode, name,
                 atom=None, shape=None,
                 title="", filters=None,
                 chunkshape=None, byteorder=None,
                 _log=True):
        """
        Create a `CArray` instance.

        `parentNode`, `name`
            Passed unchanged to the parent class initializer.

        `atom`
            An `Atom` instance representing the *type* and *shape* of
            the atomic objects to be saved.  When it is ``None`` the
            node is treated as already existing (not new) and the
            `shape` and `chunkshape` arguments are not validated.

        `shape`
            The shape of the new array.

        `title`
            A description for this node (it sets the ``TITLE`` HDF5
            attribute on disk).

        `filters`
            An instance of the `Filters` class that provides
            information about the desired I/O filters to be applied
            during the life of this object.

        `chunkshape`
            The shape of the data chunk to be read or written in a
            single HDF5 I/O operation.  Filters are applied to those
            chunks of data.  The dimensionality of `chunkshape` must
            be the same as that of `shape`.  If ``None``, a sensible
            value is calculated (which is recommended).

        `byteorder`
            The byteorder of the data *on disk*, specified as 'little'
            or 'big'.  If this is not specified, the byteorder is that
            of the platform.

        `_log`
            Passed unchanged to the parent class initializer.

        A ``ValueError`` is raised for a new node when `atom` is not an
        `Atom` instance, when `shape` is ``None``, when the ranks of
        `shape` and `chunkshape` differ, or when any `chunkshape` entry
        is lower than 1.  A ``TypeError`` is raised when `shape` or
        `chunkshape` cannot be converted into a tuple.
        """

        self.atom = atom
        """
        An `Atom` instance representing the type and shape of the
        atomic objects to be saved.
        """
        self.shape = None
        """The shape of the stored array."""
        self.extdim = -1  # `CArray` objects are not enlargeable by default
        """The index of the enlargeable dimension."""

        # Other private attributes
        self._v_version = None
        """The object version of this array."""
        self._v_new = new = atom is not None
        """Is this the first time the node has been created?"""
        self._v_new_title = title
        """New title for this node."""
        self._v_convert = True
        """Whether the ``Array`` object must be converted or not."""
        self._v_chunkshape = chunkshape
        """Private storage for the `chunkshape` property of the leaf."""

        # Miscellaneous iteration rubbish.
        self._start = None
        """Starting row for the current iteration."""
        self._stop = None
        """Stopping row for the current iteration."""
        self._step = None
        """Step size for the current iteration."""
        self._nrowsread = None
        """Number of rows read up to the current state of iteration."""
        self._startb = None
        """Starting row for current buffer."""
        self._stopb = None
        """Stopping row for current buffer. """
        self._row = None
        """Current row in iterators (sentinel)."""
        self._init = False
        """Whether we are in the middle of an iteration or not (sentinel)."""
        self.listarr = None
        """Current buffer in iterators."""

        if new:
            if not isinstance(atom, Atom):
                raise ValueError(
                    "atom parameter should be an instance of tables.Atom "
                    "and you passed a %s." % type(atom))
            if shape is None:
                raise ValueError("you must specify a non-empty shape")
            try:
                shape = tuple(shape)
            except TypeError:
                raise TypeError(
                    "`shape` parameter must be a sequence "
                    "and you passed a %s" % type(shape))
            self.shape = tuple(SizeType(s) for s in shape)

            if chunkshape is not None:
                try:
                    chunkshape = tuple(chunkshape)
                except TypeError:
                    raise TypeError(
                        "`chunkshape` parameter must be a sequence "
                        "and you passed a %s" % type(chunkshape))
                if len(shape) != len(chunkshape):
                    raise ValueError(
                        "the shape (%s) and chunkshape (%s) ranks "
                        "must be equal." % (shape, chunkshape))
                elif min(chunkshape) < 1:
                    raise ValueError(
                        "chunkshape parameter cannot have zero-dimensions.")
                self._v_chunkshape = tuple(SizeType(s) for s in chunkshape)

        # The `Array` class is not abstract enough! :(
        # ``super(Array, self)`` deliberately skips ``Array.__init__`` and
        # starts the lookup at the class that follows `Array` in the MRO.
        super(Array, self).__init__(parentNode, name, new, filters,
                                    byteorder, _log)

    def _g_create(self):
        """Create a new array in file (specific part).

        Raises a ``ValueError`` if any dimension in ``self.shape`` is
        lower than 1; otherwise it returns whatever
        `_g_create_common()` returns.
        """

        if min(self.shape) < 1:
            raise ValueError(
                "shape parameter cannot have zero-dimensions.")
        # Finish the common part of creation process
        return self._g_create_common(self.nrows)

    def _g_create_common(self, expectedrows):
        """Create a new array in file (common part).

        `expectedrows` is only used to compute a chunkshape when none
        was given at construction time.  The value stored in
        ``self._v_objectID`` is returned.
        """

        self._v_version = obversion

        if self._v_chunkshape is None:
            # Compute the optimal chunk size
            self._v_chunkshape = self._calc_chunkshape(
                expectedrows, self.rowsize, self.atom.itemsize)
        # Compute the optimal nrowsinbuf
        self.nrowsinbuf = self._calc_nrowsinbuf(
            self._v_chunkshape, self.rowsize, self.atom.itemsize)
        # Correct the byteorder if needed
        if self.byteorder is None:
            self.byteorder = correct_byteorder(self.atom.type, sys.byteorder)

        try:
            # ``self._v_objectID`` needs to be set because would be
            # needed for setting attributes in some descendants later
            # on
            self._v_objectID = self._createCArray(self._v_new_title)
        except:  #XXX
            # Problems creating the Array on disk. Close node and re-raise.
            # The bare ``except`` is acceptable here because the exception
            # is always propagated after the cleanup.
            self.close(flush=0)
            raise
        return self._v_objectID

    def _g_copyWithStats(self, group, name, start, stop, step,
                         title, filters, chunkshape, _log, **kwargs):
        """Private part of Leaf.copy() for each kind of leaf.

        A new `CArray` called `name` is created under `group` and
        filled, buffer by buffer, with the rows of this array selected
        by `start`, `stop` and `step` along the main dimension.

        Returns a ``(new_node, nbytes)`` tuple.
        """

        (start, stop, step) = self._processRangeRead(start, stop, step)
        maindim = self.maindim
        shape = list(self.shape)
        shape[maindim] = lrange(start, stop, step).length
        # Now, fill the new carray with values from source
        nrowsinbuf = self.nrowsinbuf
        # The slices parameter for self.__getitem__
        slices = [slice(0, dim, 1) for dim in self.shape]
        # This is a hack to prevent doing unnecessary conversions
        # when copying buffers
        self._v_convert = False
        try:
            # Build the new CArray object
            new_node = CArray(group, name, atom=self.atom, shape=shape,
                              title=title, filters=filters,
                              chunkshape=chunkshape, _log=_log)
            # Start the copy itself
            for start2 in lrange(start, stop, step*nrowsinbuf):
                # Save the records on disk
                stop2 = start2 + step * nrowsinbuf
                if stop2 > stop:
                    stop2 = stop
                # Set the proper slice in the main dimension
                slices[maindim] = slice(start2, stop2, step)
                # Floor division keeps the destination index an integer
                # regardless of the division semantics in effect.
                start3 = (start2 - start) // step
                stop3 = start3 + nrowsinbuf
                if stop3 > shape[maindim]:
                    stop3 = shape[maindim]
                # The next line should be generalised if, in the future,
                # maindim is designed to be different from 0 in CArrays.
                # See ticket #199.
                new_node[start3:stop3] = self.__getitem__(tuple(slices))
        finally:
            # Activate the conversion again (default), even when the
            # copy fails half way.
            self._v_convert = True
        # NOTE(review): the byte count is computed from the shape of the
        # *source* array, not from the (possibly smaller) copied range --
        # confirm that this is what callers expect.
        nbytes = numpy.prod(self.shape, dtype=SizeType)*self.atom.itemsize

        return (new_node, nbytes)