# Source code for tokio.tools.hdf5

#!/usr/bin/env python
"""Retrieve data from TOKIO Time Series files using time as inputs

Provides a mapping between dates and times and a site's time-indexed repository
of TOKIO Time Series HDF5 files.
"""

import datetime
import warnings
import tokio.tools.common
import tokio.connectors.hdf5


def enumerate_h5lmts(fsname, datetime_start, datetime_end):
    """Alias for :meth:`tokio.tools.hdf5.enumerate_hdf5`

    Thin wrapper that forwards all three arguments, unchanged and in the same
    order, to :meth:`enumerate_hdf5` and returns its result.

    Args:
        fsname (str): Logical file system name, passed through as-is.
        datetime_start (datetime.datetime): Start of the date range, passed
            through as-is.
        datetime_end (datetime.datetime): End of the date range, passed
            through as-is.

    Returns:
        Whatever :meth:`enumerate_hdf5` returns for the same arguments.
    """
    return enumerate_hdf5(fsname, datetime_start, datetime_end)

def enumerate_hdf5(fsname, datetime_start, datetime_end):
    """Returns all time-indexed HDF5 files falling between a time range

    Given a starting and ending datetime, returns the names of all HDF5 files
    that should contain data falling within that date range (inclusive).  The
    lookup itself is delegated to
    :meth:`tokio.tools.common.enumerate_dated_files`, using the ``hdf5_files``
    entry of ``tokio.config.CONFIG`` as the path template and ``fsname`` as the
    key into that template.

    Args:
        fsname (str): Logical file system name; should match a key within
            the ``hdf5_files`` config item in ``site.json``.
        datetime_start (datetime.datetime): Begin including files corresponding
            to this start date, inclusive.
        datetime_end (datetime.datetime): Stop including files with timestamps
            that follow this end date.  Resulting files _will_ include this
            date.

    Returns:
        list: List of strings, each describing a path to an existing HDF5 file
        that should contain data relevant to the requested start and end
        dates.
    """
    # NOTE(review): tokio.config is referenced here but is not imported
    # explicitly at the top of this module; presumably it is loaded as a side
    # effect of the other tokio imports -- confirm.
    return tokio.tools.common.enumerate_dated_files(
        start=datetime_start,
        end=datetime_end,
        template=tokio.config.CONFIG['hdf5_files'],
        lookup_key=fsname,
        match_first=True)

def get_files_and_indices(fsname, dataset_name, datetime_start, datetime_end):
    """Retrieve filenames and indices within files corresponding to a date range

    Given a logical file system name and a dataset within that file system's
    TOKIO Time Series files, return a list of all file names and the indices
    within those files that fall within the specified date range.

    Args:
        fsname (str): Logical file system name; should match a key within
            the ``hdf5_files`` config item in ``site.json``.
        dataset_name (str): Name of a TOKIO Time Series dataset name
        datetime_start (datetime.datetime): Begin including files corresponding
            to this start date, inclusive.
        datetime_end (datetime.datetime or None): Stop including files with
            timestamps that follow this end date.  If ``None``,
            ``datetime_start`` is used as the end date as well.

    Returns:
        list: List of three-item tuples of types (str, int, int), where

        * element 0 is the path to an existing HDF5 file
        * element 1 is the first index of ``dataset_name`` within that file to
          use: the index of ``datetime_start`` if the file's first timestamp
          is at or before ``datetime_start``, otherwise 0
        * element 2 is the last index of ``dataset_name`` within that file to
          use: one less than the index of ``datetime_end`` if the file's last
          timestamp is at or after ``datetime_end``, otherwise -1

        A file whose computed last index would be negative (i.e.,
        ``datetime_end`` maps to the file's very first element) is omitted
        from the list entirely.
    """
    if datetime_end is None:
        datetime_end = datetime_start

    h5lmt_files = enumerate_h5lmts(fsname, datetime_start, datetime_end)
    output = []

    for h5lmt_file in h5lmt_files:
        with tokio.connectors.hdf5.Hdf5(h5lmt_file, mode="r") as hdf5:
            timestamps = hdf5.get_timestamps(dataset_name)

            # datetime.fromtimestamp() without a tz yields a naive local-time
            # datetime, so the comparisons below are made in local time.
            i_0 = 0
            if datetime.datetime.fromtimestamp(timestamps[0]) <= datetime_start:
                # The range begins inside this file (the first day's hdf5)
                i_0 = hdf5.get_index(dataset_name, datetime_start)

            # NOTE(review): -1 is used as-is when the range extends past this
            # file; whether callers treat element 2 as inclusive or exclusive
            # is not visible here -- confirm against callers.
            i_f = -1
            if datetime.datetime.fromtimestamp(timestamps[-1]) >= datetime_end:
                # The range ends inside this file (the last day's hdf5);
                # subtract 1 because datetime_end should be exclusive.
                i_f = hdf5.get_index(dataset_name, datetime_end) - 1
                # If the last timestamp is on the first datapoint of a new day,
                # just drop the whole day to maintain exclusivity of the last
                # timestamp
                if i_f < 0:
                    continue

        output.append((h5lmt_file, i_0, i_f))
    return output

def get_dataframe_from_time_range(fsname, dataset_name, datetime_start, datetime_end, fix_errors=False):
    """Returns all TOKIO Time Series data within a time range as a DataFrame.

    Given a time range,

    1. Find all TOKIO Time Series HDF5 files that exist and overlap with that time range
    2. Open each and load all data that falls within the given time range
    3. Convert loaded data into a single, time-indexed DataFrame

    Args:
        fsname (str): Name of file system whose data should be retrieved.
            Should exist as a key within ``tokio.config.CONFIG['hdf5_files']``
        dataset_name (str): Dataset within each matching HDF5 file to load
        datetime_start (datetime.datetime): Lower bound of time range to load,
            inclusive
        datetime_end (datetime.datetime): Upper bound of time range to load,
            exclusive
        fix_errors (bool): Replace negative values with -0.0.  Necessary if any
            HDF5 files contain negative values as a result of being archived
            with a buggy version of pytokio.  A warning reporting the number
            of replaced values is issued when any are found.

    Returns:
        pandas.DataFrame or None: DataFrame indexed in time, sorted by index,
        and whose columns correspond to those in the given `dataset_name`.
        ``None`` if no HDF5 files were found for the requested time range.
    """
    result = None

    hdf5_filenames = enumerate_h5lmts(fsname, datetime_start, datetime_end)
    if not hdf5_filenames:
        return result

    for hdf_filename in hdf5_filenames:
        with tokio.connectors.hdf5.Hdf5(hdf_filename, mode='r') as hdf_file:
            # Load the whole dataset, then keep only rows in [start, end)
            df_slice = hdf_file.to_dataframe(dataset_name)
            df_slice = df_slice[(df_slice.index >= datetime_start)
                                & (df_slice.index < datetime_end)]
            if result is None:
                result = df_slice
            else:
                # Grow the accumulated index to cover the new rows, then assign
                # them by label; rows at timestamps already present in result
                # are overwritten by the values from this later file.
                result = result.reindex(result.index.union(df_slice.index))
                result.loc[df_slice.index] = df_slice

    # Some versions of pytokio's archive_lmtdb were affected by a bug that could
    # produce negative numbers; this replaces those bad data points with -0.0
    # (they are not dropped) and warns with the number of values replaced.
    if fix_errors and result is not None:
        errors = (result < 0.0).sum().sum()
        if errors:
            result.mask(cond=lambda x: x < 0.0, other=-0.0, inplace=True)
            warnings.warn("Corrected %d errors" % errors)

    return result.sort_index()