leaf.py : » Database » PyTables » tables-2.1.2 » tables » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Database » PyTables
PyTables » tables 2.1.2 » tables » leaf.py
########################################################################
#
#       License: BSD
#       Created: October 14, 2002
#       Author:  Francesc Alted - faltet@pytables.com
#
#       $Id: leaf.py 3905 2008-11-13 12:22:18Z faltet $
#
########################################################################

"""Here is defined the Leaf class.

See Leaf class docstring for more info.

Classes:

    Leaf

Functions:

    calc_chunksize

Misc variables:

    __version__


"""

import sys
import warnings
import math

import numpy

import tables
from tables.flavor import (check_flavor,internal_flavor
                            alias_map as flavor_alias_map )
from tables import hdf5Extension
from tables.node import Node
from tables.filters import Filters
from tables.utils import byteorders,idx2long,lazyattr,SizeType
from tables.utilsExtension import whichLibVersion
from tables.exceptions import PerformanceWarning
from tables import utilsExtension


__version__ = "$Revision: 3905 $"


def csformula(expectedsizeinMB):
    """Return the fitted chunksize for expectedsizeinMB."""
    # For a basesize of 8 KB, this will return:
    # 8 KB for datasets <= 1 MB
    # 1 MB for datasets >= 10 TB
    basesize = 8*1024   # 8 KB is a good minimum
    return basesize * int(2**math.log10(expectedsizeinMB))


def limit_es(expectedsizeinMB):
    """Protection against creating too small or too large chunks."""
    if expectedsizeinMB < 1:        # < 1 MB
        expectedsizeinMB = 1
    elif expectedsizeinMB > 10**7:  # > 10 TB
        expectedsizeinMB = 10**7
    return expectedsizeinMB


def calc_chunksize(expectedsizeinMB):
    """Compute the optimum HDF5 chunksize for I/O purposes.

    Rational: HDF5 takes the data in bunches of chunksize length to
    write the on disk. A BTree in memory is used to map structures on
    disk. The more chunks that are allocated for a dataset the larger
    the B-tree. Large B-trees take memory and causes file storage
    overhead as well as more disk I/O and higher contention for the meta
    data cache.  You have to balance between memory and I/O overhead
    (small B-trees) and time to access to data (big B-trees).

    The tuning of the chunksize parameter affects the performance and
    the memory consumed. This is based on my own experiments and, as
    always, your mileage may vary.
    """

    expectedsizeinMB = limit_es(expectedsizeinMB)
    zone = int(math.log10(expectedsizeinMB))
    expectedsizeinMB = 10**zone
    chunksize = csformula(expectedsizeinMB)
    return chunksize



class Leaf(Node):
    """
    Abstract base class for all PyTables leaves.

    A leaf is a node (see the `Node` class) which hangs from a group
    (see the `Group` class) but, unlike a group, it can not have any
    further children below it (i.e. it is an end node).

    This definition includes all nodes which contain actual data
    (datasets handled by the `Table`, `Array`, `CArray`, `EArray` and
    `VLArray` classes) and unsupported nodes (the `UnImplemented` class)
    --these classes do in fact inherit from `Leaf`.

    Public instance variables
    -------------------------

    The following instance variables are provided in addition to those
    in `Node`:

    byteorder
        The byte ordering of the leaf data *on disk*.
    chunkshape
        The HDF5 chunk size for chunked leaves (a tuple).

        This is read-only because you cannot change the chunk size of a
        leaf once it has been created.
    filters
        Filter properties for this leaf --see `Filters`.
    flavor
        The type of the data object read from this leaf.

        It can be any of 'numpy', 'numarray', 'numeric' or 'python' (the
        set of supported flavors depends on which packages you have
        installed on your system).

        You can (and are encouraged to) use this property to get, set
        and delete the ``FLAVOR`` HDF5 attribute of the leaf.  When the
        leaf has no such attribute, the default flavor is used.
    maindim
        The dimension along which iterators work.

        Its value is 0 (i.e. the first dimension) when the dataset is
        not extendable, and `Leaf.extdim` (where available) for
        extendable ones.
    nrows
        The length of the main dimension of the leaf data.
    nrowsinbuf
        The number of rows that fit in internal input buffers.

        You can change this to fine-tune the speed or memory
        requirements of your application.
    shape
        The shape of data in the leaf.

    Public instance variables -- aliases
    ------------------------------------

    The following instance variables are just easier-to-write aliases to
    their `Node` counterparts (indicated between parentheses):

    attrs
        The associated `AttributeSet` instance (`Node._v_attrs`).
    name
        The name of this node in its parent group (`Node._v_name`).
    objectID
        A node identifier (may change from run to run).
        (`Node._v_objectID`).
    title
        A description for this node (`Node._v_title`).

    Public methods
    --------------

    * close([flush])
    * copy([newparent][, newname][, overwrite][, createparents][, **kwargs])
    * delAttr(name)
    * flush()
    * getAttr(name)
    * isVisible()
    * move([newparent][, newname][, overwrite])
    * remove()
    * rename(newname)
    * truncate(size)
    * setAttr(name, value)
    * _f_close([flush])
    * __len__()
    """

    # Properties
    # ~~~~~~~~~~

    # Node property aliases
    # `````````````````````
    # These are a little hard to override, but so are properties.
    attrs = Node._v_attrs
    title = Node._v_title

    # Read-only node property aliases
    # ```````````````````````````````
    name = property(
        lambda self: self._v_name, None, None,
        "The name of this node in its parent group (a string)." )

    chunkshape = property(
        lambda self: self._v_chunkshape, None, None,
        """
        The HDF5 chunk size for chunked leaves (a tuple).

        This is read-only because you cannot change the chunk size of a
        leaf once it has been created.
        """ )

    objectID = property(
        lambda self: self._v_objectID, None, None,
        "A node identifier (may change from run to run)." )

    # Lazy read-only attributes
    # `````````````````````````
    @lazyattr
    def filters(self):
        """Filter properties for this leaf."""
        return Filters._from_leaf(self)

    # Other properties
    # ````````````````
    def _getmaindim(self):
        if self.extdim < 0:
            return 0  # choose the first dimension
        return self.extdim

    maindim = property(
        _getmaindim, None, None,
        """
        The dimension along which iterators work.

        Its value is 0 (i.e. the first dimension) when the dataset is
        not extendable, and `Leaf.extdim` (where available) for
        extendable ones.
        """ )

    def _setflavor(self, flavor):
        self._v_file._checkWritable()
        check_flavor(flavor)
        self._v_attrs.FLAVOR = self._flavor = flavor  # logs the change

    def _delflavor(self):
        del self._v_attrs.FLAVOR
        self._flavor = internal_flavor

    flavor = property(
        lambda self: self._flavor, _setflavor, _delflavor,
        """
        The representation of data read from this leaf.

        It can be any of 'numpy', 'numarray', 'numeric' or 'python' (the
        set of supported flavors depends on which packages you have
        installed on your system).

        You can (and are encouraged to) use this property to get, set
        and delete the ``FLAVOR`` HDF5 attribute of the leaf.  When the
        leaf has no such attribute, the default flavor is used.
        """ )


    # Special methods
    # ~~~~~~~~~~~~~~~
    def __init__(self, parentNode, name,
                 new=False, filters=None,
                 byteorder=None, _log=True):
        self._v_new = new
        """Is this the first time the node has been created?"""
        self.nrowsinbuf = None
        """
        The number of rows that fits in internal input buffers.

        You can change this to fine-tune the speed or memory
        requirements of your application.
        """
        self._flavor = None
        """Private storage for the `flavor` property."""

        if new:
            # Get filter properties from parent group if not given.
            if filters is None:
                filters = parentNode._v_filters
            self.__dict__['filters'] = filters  # bypass the property

            if byteorder not in (None, 'little', 'big'):
                raise ValueError(
                    "the byteorder can only take 'little' or 'big' values "
                    "and you passed: %s" % byteorder)
            self.byteorder = byteorder
            """The byte ordering of the leaf data *on disk*."""

        # Existing filters need not be read since `filters`
        # is a lazy property that automatically handles their loading.

        super(Leaf, self).__init__(parentNode, name, _log)


    def __len__(self):
        """
        Return the length of the main dimension of the leaf data.

        Please note that this may raise an ``OverflowError`` on 32-bit
        platforms for datasets having more than 2**31-1 rows.  This is a
        limitation of Python that you can work around by using the
        ``nrows`` or ``shape`` attributes.
        """
        return self.nrows


    def __str__(self):

        """The string representation for this object is its pathname in
        the HDF5 object tree plus some additional metainfo.
        """

        # Get this class name
        classname = self.__class__.__name__
        # The title
        title = self._v_title
        # The filters
        filters = ""
        if self.filters.fletcher32:
            filters += ", fletcher32"
        if self.filters.complevel:
            if self.filters.shuffle:
                filters += ", shuffle"
            filters += ", %s(%s)" % (self.filters.complib,
                                     self.filters.complevel)
        return "%s (%s%s%s) %r" % \
               (self._v_pathname, classname, self.shape, filters, title)


    # Private methods
    # ~~~~~~~~~~~~~~~
    def _g_postInitHook(self):
        """
        Code to be run after node creation and before creation logging.

        This method gets or sets the flavor of the leaf.
        """

        super(Leaf, self)._g_postInitHook()
        if self._v_new:  # set flavor of new node
            if self._flavor is None:
                self._flavor = internal_flavor
            else:  # flavor set at creation time, do not log
                if self._v_file.params['PYTABLES_SYS_ATTRS']:
                    self._v_attrs._g__setattr('FLAVOR', self._flavor)
        else:  # get flavor of existing node (if any)
            if self._v_file.params['PYTABLES_SYS_ATTRS']:
                flavor = getattr(self._v_attrs, 'FLAVOR', internal_flavor)
                self._flavor = flavor_alias_map.get(flavor, flavor)
            else:
                self._flavor = internal_flavor


    def _calc_chunkshape(self, expectedrows, rowsize, itemsize):
        """Calculate the shape for the HDF5 chunk."""

        # In case of a scalar shape, return the unit chunksize
        if self.shape == ():
            return (SizeType(1),)

        # Compute the chunksize
        MB = 1024 * 1024
        expectedsizeinMB = (expectedrows * rowsize) / MB
        chunksize = calc_chunksize(expectedsizeinMB)

        maindim = self.maindim
        # Compute the chunknitems
        chunknitems = chunksize // itemsize
        # Safeguard against itemsizes being extremely large
        if chunknitems == 0:
            chunknitems = 1
        chunkshape = list(self.shape)
        # Check whether trimming the main dimension is enough
        chunkshape[maindim] = 1
        newchunknitems = numpy.prod(chunkshape, dtype=SizeType)
        if newchunknitems <= chunknitems:
            chunkshape[maindim] = chunknitems // newchunknitems
        else:
            # No, so start trimming other dimensions as well
            for j in xrange(len(chunkshape)):
                # Check whether trimming this dimension is enough
                chunkshape[j] = 1
                newchunknitems = numpy.prod(chunkshape, dtype=SizeType)
                if newchunknitems <= chunknitems:
                    chunkshape[j] = chunknitems // newchunknitems
                    break
            else:
                # Ops, we ran out of the loop without a break
                # Set the last dimension to chunknitems
                chunkshape[-1] = chunknitems

        return tuple(SizeType(s) for s in chunkshape)


    def _calc_nrowsinbuf(self, chunkshape, rowsize, itemsize):
        """Calculate the number of rows that fits on a PyTables buffer."""

        params = self._v_file.params
        # Compute the nrowsinbuf
        chunksize = numpy.prod(chunkshape, dtype=SizeType) * itemsize
        buffersize = chunksize * params['CHUNKTIMES']
        nrowsinbuf = buffersize // rowsize
        # Safeguard against row sizes being extremely large
        if nrowsinbuf == 0:
            nrowsinbuf = 1
            # If rowsize is too large, issue a Performance warning
            maxrowsize = params['BUFFERTIMES'] * buffersize
            if rowsize > maxrowsize:
                warnings.warn("""\
The Leaf ``%s`` is exceeding the maximum recommended rowsize (%d bytes);
be ready to see PyTables asking for *lots* of memory and possibly slow
I/O.  You may want to reduce the rowsize by trimming the value of
dimensions that are orthogonal (and preferably close) to the main
dimension of this leave.  Alternatively, in case you have specified a
very small/large chunksize, you may want to increase/decrease it."""
                              % (self._v_pathname, maxrowsize),
                                 PerformanceWarning)
# It is difficult to forsee the level of code nesting to reach user code.
#f = sys._getframe(8)
###Caller --> %s (%s:%s)"""
#                    f.f_code.co_name,
#                    f.f_code.co_filename, f.f_lineno,),
        return nrowsinbuf


    # This method is appropriate for calls to __getitem__ methods
    def _processRange(self, start, stop, step, dim=None, warn_negstep=True):
        if dim is None:
            nrows = len(self)  # self.shape[self.maindim]
        else:
            nrows = self.shape[dim]

        if warn_negstep and step and step < 0 :
            raise ValueError("slice step cannot be negative")
        # In order to convert possible numpy.integer values to long ones
        # F. Alted 2006-05-02
        if start is not None: start = idx2long(start)
        if stop is not None: stop = idx2long(stop)
        if step is not None: step = idx2long(step)
        # (start, stop, step) = slice(start, stop, step).indices(nrows)
        # The next function is a substitute for slice().indices in order to
        # support full 64-bit integer for slices even in 32-bit machines.
        # F. Alted 2005-05-08
        (start, stop, step) = utilsExtension.getIndices(
            slice(start, stop, step), long(nrows) )

        return (start, stop, step)


    # This method is appropiate for calls to read() methods
    def _processRangeRead(self, start, stop, step, warn_negstep=True):
        nrows = self.nrows
        if start is None and stop is None:
            start = 0
            stop = nrows
        if start is not None and stop is None:
            # Protection against start greater than available records
            # nrows == 0 is a special case for empty objects
            if nrows > 0 and start >= nrows:
                raise IndexError( "start of range (%s) is greater than "
                                  "number of rows (%s)" % (start, nrows) )
            step = 1
            if start == -1:  # corner case
                stop = nrows
            else:
                stop = start + 1
        # Finally, get the correct values (over the main dimension)
        start, stop, step = self._processRange(
            start, stop, step, warn_negstep=warn_negstep)

        return (start, stop, step)


    def _g_copy(self, newParent, newName, recursive, _log=True, **kwargs):
        # Compute default arguments.
        start = kwargs.pop('start', None)
        stop = kwargs.pop('stop', None)
        step = kwargs.pop('step', None)
        title = kwargs.pop('title', self._v_title)
        filters = kwargs.pop('filters', self.filters)
        chunkshape = kwargs.pop('chunkshape', self.chunkshape)
        copyuserattrs = kwargs.pop('copyuserattrs', True)
        stats = kwargs.pop('stats', None)
        if chunkshape == 'keep':
            chunkshape = self.chunkshape  # Keep the original chunkshape
        elif chunkshape == 'auto':
            chunkshape = None             # Will recompute chunkshape

        # Fix arguments with explicit None values for backwards compatibility.
        if title is None:  title = self._v_title
        if filters is None:  filters = self.filters

        # Create a copy of the object.
        (newNode, bytes) = self._g_copyWithStats(
            newParent, newName, start, stop, step,
            title, filters, chunkshape, _log, **kwargs)

        # Copy user attributes if requested (or the flavor at least).
        if copyuserattrs == True:
            self._v_attrs._g_copy(newNode._v_attrs)
        elif 'FLAVOR' in self._v_attrs:
            if self._v_file.params['PYTABLES_SYS_ATTRS']:
                newNode._v_attrs._g__setattr('FLAVOR', self._flavor)
        newNode._flavor = self._flavor  # update cached value

        # Update statistics if needed.
        if stats is not None:
            stats['leaves'] += 1
            stats['bytes'] += bytes

        return newNode


    def _g_fix_byteorder_data(self, data, dbyteorder):
        "Fix the byteorder of data passed in constructors."
        dbyteorder = byteorders[dbyteorder]
        # If self.byteorder has not been passed as an argument of
        # the constructor, then set it to the same value of data.
        if self.byteorder is None:
            self.byteorder = dbyteorder
        # Do an additional in-place byteswap of data if the in-memory
        # byteorder doesn't match that of the on-disk.  This is the only
        # place that we have to do the conversion manually. In all the
        # other cases, it will be HDF5 the responsible of doing the
        # byteswap properly.
        if dbyteorder in ['little', 'big']:
            if dbyteorder != self.byteorder:
                # if data is not writeable, do a copy first
                if not data.flags.writeable:
                    data = data.copy()
                data.byteswap(True)
        else:
            # Fix the byteorder again, no matter which byteorder have
            # specified the user in the constructor.
            self.byteorder = "irrelevant"
        return data


    # Public methods
    # ~~~~~~~~~~~~~~
    # Tree manipulation
    # `````````````````
    def remove(self):
        """
        Remove this node from thehierarchy. import 

        This method has the behavior described in `Node._f_remove()`.
        Please note that there is no ``recursive`` flag since leaves
        do not have child nodes.
        """
        self._f_remove(False)


    def rename(self, newname):
        """
        Rename this node in place.

        This method has the behavior described in `Node._f_rename()`.
        """
        self._f_rename(newname)


    def move( self, newparent=None, newname=None,
              overwrite=False, createparents=False ):
        """
        Move or rename this node.

        This method has the behavior described in `Node._f_move()`.
        """
        self._f_move(newparent, newname, overwrite, createparents)


    def copy( self, newparent=None, newname=None,
              overwrite=False, createparents=False, **kwargs ):
        """
        Copy this node and return the new one.

        This method has the behavior described in `Node._f_copy()`.
        Please note that there is no ``recursive`` flag since leaves
        do not have child nodes.  In addition, this method recognises
        the following keyword arguments:

        `title`
            The new title for the destination.  If omitted or
            ``None``, the original title is used.
        `filters`
            Specifying this parameter overrides the original filter
            properties in the source node.  If specified, it must be
            an instance of the `Filters` class.  The default is to
            copy the filter properties from thesourcenode. import 
        `copyuserattrs`
            You can prevent the user attributes from beingcopiedby import 
            setting this parameter to ``False``.  The default is to
            copy them.
        `start`, `stop`, `step`
            Specify the range of rows to be copied; the default is to
            copy all the rows.
        `chunkshape`
            The chunkshape of the new leaf.  It supports a couple of
            special values.  A value of 'keep' means that the chunkshape
            will be the same than original leaf (this is the default).
            A value of 'auto' means that a new shape will be computed
            automatically in order to ensure best performance when
            accessing the dataset through the main dimension.  Any other
            value should be an integer or a tuple matching the
            dimensions of the leaf.
        `stats`
            This argument may be used to collect statistics on the
            copy process.  When used, it should be a dictionary whith
            keys ``'groups'``, ``'leaves'`` and ``'bytes'`` having a
            numeric value.  Their values will be incremented to
            reflect the number of groups, leaves and bytes,
            respectively, that have been copied during the operation.

        .. Warning:: Note that unknown parameters passed to this method
           will be ignored, so may want to double check the spell of
           these (i.e. if you write them incorrectly, they will most
           probably be ignored).
        """
        return self._f_copy(
            newparent, newname, overwrite, createparents, **kwargs )


    def truncate(self, size):
        """Truncate the main dimension to be `size` rows.

        If the main dimension previously was larger than this `size`,
        the extra data is lost.  If the main dimension previously was
        shorter, it is extended, and the extended part is filled with
        the default values.

        The truncation operation can only be applied to *enlargeable*
        datasets, else a `TypeError` will be raised.

        .. Warning:: If you are using the HDF5 1.6.x series, and due to
           limitations of them, `size` must be greater than zero
           (i.e. the dataset can not be completely emptied).  A
           `ValueError` will be issued if you are using HDF5 1.6.x and
           try to pass a zero size to this method.  HDF5 1.8.x doesn't
           undergo this problem.
        """
        # A non-enlargeable arrays (Array, CArray) cannot be truncated
        if self.extdim < 0:
            raise TypeError("non-enlargeable datasets cannot be truncated")
        if (size > 0 or
            (size == 0 and whichLibVersion("hdf5")[1] >= "1.8.0")):
                self._g_truncate(size)
        else:
            raise ValueError("""
`size` must be greater than 0 if you are using HDF5 < 1.8.0.
With HDF5 1.8.0 and higher, `size` can also be 0 or greater.""")


    def isVisible(self):
        """
        Is this node visible?

        This method has the behavior described in `Node._f_isVisible()`.
        """
        return self._f_isVisible()


    # Attribute handling
    # ``````````````````
    def getAttr(self, name):
        """
        Get a PyTables attribute from thisnode. import 

        This method has the behavior described in `Node._f_getAttr()`.
        """
        return self._f_getAttr(name)


    def setAttr(self, name, value):
        """
        Set a PyTables attribute for this node.

        This method has the behavior described in `Node._f_setAttr()`.
        """
        self._f_setAttr(name, value)


    def delAttr(self, name):
        """
        Delete a PyTables attribute from thisnode. import 

        This method has the behavior described in `Node._f_delAttr()`.
        """
        self._f_delAttr(name)


    # Data handling
    # `````````````
    def flush(self):
        """
        Flush pending data to disk.

        Saves whatever remaining buffered data to disk.  It also
        releases I/O buffers, so if you are filling many datasets in the
        same PyTables session, please call ``flush()`` extensively so as
        to help PyTables to keep memory requirements low.
        """
        self._g_flush()


    def _f_close(self, flush=True):
        """
        Close this node in the tree.

        This method has the behavior described in `Node._f_close()`.
        Besides that, the optional argument `flush` tells whether to
        flush pending data to disk or not before closing.
        """

        if not self._v_isopen:
            return  # the node is already closed or not initialized

        # Only do a flush in case the leaf has an IO buffer.  The
        # internal buffers of HDF5 will be flushed afterwards during the
        # self._g_close() call.  Avoiding an unnecessary flush()
        # operation accelerates the closing for the unbuffered leaves.
        if flush and hasattr(self, "_v_iobuf"):
            self.flush()

        # Close the dataset and release resources
        self._g_close()

        # Close myself as a node.
        super(Leaf, self)._f_close()


    def close(self, flush=True):
        """
        Close this node in the tree.

        This method is completely equivalent to `Leaf._f_close()`.
        """
        self._f_close(flush)



## Local Variables:
## mode: python
## py-indent-offset: 4
## tab-width: 4
## fill-column: 72
## End:
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.