#!/usr/bin/env python
"""
Given a file system and a datetime, return summary statistics about the OST
fullness at that time
"""
import datetime
import time

import tokio.config
import tokio.connectors.nersc_lfsstate as nersc_lfsstate
import tokio.tools.common
import tokio.tools.hdf5
# Ordered list of data sources tried by get_fullness(); may be overridden via
# the 'lfsstatus_fullness_providers' key in tokio.config.CONFIG
DEFAULT_FULLNESS_PROVIDERS = ['hdf5', 'nersc_lfsstate']
def get_fullness(file_system, datetime_target, **kwargs):
    """Get file system fullness from the first available provider.

    Walks the configured list of fullness providers in order and returns the
    first non-empty result.

    Args:
        file_system (str): Logical name of file system whose data should be
            retrieved (e.g., cscratch)
        datetime_target (datetime.datetime): Time at which requested data
            should be retrieved

    Returns:
        dict: various statistics about the file system fullness

    Raises:
        tokio.ConfigError: When no valid providers are found
    """
    providers = tokio.config.CONFIG.get('lfsstatus_fullness_providers', DEFAULT_FULLNESS_PROVIDERS)

    found = False
    result = {}
    for provider in providers:
        if provider == 'hdf5':
            found = True
            try:
                result = get_fullness_hdf5(file_system, datetime_target)
            except KeyError:
                # raised when the HDF5 source has no fullness data
                found = False
            if result:
                return result
        elif provider == 'nersc_lfsstate':
            found = True
            # translate the logical name into the Lustre backend name, if known
            backend = tokio.config.CONFIG.get('fsname_to_backend_name', {}).get(file_system)
            try:
                result = get_lfsstate(backend if backend else file_system,
                                      datetime_target,
                                      "fullness",
                                      **kwargs)
            except IOError:
                # raised when no lfsstate cache covers the requested time
                found = False
            if result:
                return result

    # a provider matched but produced an empty result; propagate it
    if found:
        return result
    raise tokio.ConfigError("No valid lfsstatus fullness providers found")
def get_failures(file_system, datetime_target, **kwargs):
    """Get file system failures.

    Convenience wrapper around :func:`get_lfsstate` for the "failures" metric.

    Args:
        file_system (str): Logical name of file system whose data should be
            retrieved (e.g., cscratch)
        datetime_target (datetime.datetime): Time at which requested data
            should be retrieved
        cache_file (str): Basename of file to search for the requested data

    Returns:
        dict: various statistics about the file system fullness
    """
    # translate the logical name into the Lustre backend name, if known
    backend = tokio.config.CONFIG.get('fsname_to_backend_name', {}).get(file_system)
    target_fs = backend if backend else file_system
    return get_lfsstate(target_fs, datetime_target, "failures", **kwargs)
def get_fullness_lfsstate(file_system, datetime_target, cache_file=None):
    """Get file system fullness from the nersc_lfsstate connector.

    Thin wrapper around the generic :func:`get_lfsstate` function with the
    metric fixed to ``fullness``.

    Args:
        file_system (str): Lustre file system name of the file system whose
            data should be retrieved (e.g., snx11025)
        datetime_target (datetime.datetime): Time at which requested data
            should be retrieved
        cache_file (str): Basename of file to search for the requested data

    Returns:
        Whatever is returned by :func:`tokio.tools.lfsstatus.get_lfsstate`
    """
    return get_lfsstate(file_system,
                        datetime_target,
                        metric='fullness',
                        cache_file=cache_file)
def get_failures_lfsstate(file_system, datetime_target, cache_file=None):
    """Get file system failures from the nersc_lfsstate connector.

    Thin wrapper around the generic :func:`get_lfsstate` function with the
    metric fixed to ``failures``.

    Args:
        file_system (str): Lustre file system name of the file system whose
            data should be retrieved (e.g., snx11025)
        datetime_target (datetime.datetime): Time at which requested data
            should be retrieved
        cache_file (str): Basename of file to search for the requested data

    Returns:
        Whatever is returned by :func:`get_lfsstate`
    """
    return get_lfsstate(file_system,
                        datetime_target,
                        metric='failures',
                        cache_file=cache_file)
def get_lfsstate(file_system, datetime_target, metric, cache_file=None):
    """Get file system fullness or failures.

    Given a file system name (e.g., snx11168) and a datetime object

    1. locate and load the lfs-df (fullness) or ost map (failures) file
    2. find the sample immediately preceding the datetime (don't find one
       that overlaps it)
    3. return summary statistics about the OST fullness or OST failures

    Args:
        file_system (str): Lustre file system name of the file system whose
            data should be retrieved (e.g., snx11025)
        datetime_target (datetime.datetime): Time at which requested data
            should be retrieved
        metric (str): either "fullness" or "failures"
        cache_file (str): Basename of file to search for the requested data

    Returns:
        dict: various statistics about the file system fullness

    Raises:
        ValueError: if ``metric`` does not contain a valid option
        IOError: when no valid data sources can be found for the given date
    """
    if metric == "fullness":
        template_path = tokio.config.CONFIG['lfsstatus_fullness_files']
    elif metric == "failures":
        template_path = tokio.config.CONFIG['lfsstatus_map_files']
    else:
        raise ValueError("unknown metric " + metric)

    if cache_file is None:
        # We assume a 1 day lookbehind. Very wasteful, but we index on dates in
        # local time using GMT-based unix times so we often need to look back to
        # the previous day's index. The lookahead can be much more conservative
        # since it only needs to compensate for sampling intervals (15 min in
        # practice at NERSC)
        ost_health_files = tokio.tools.common.enumerate_dated_files(
            start=datetime_target - datetime.timedelta(days=1),
            end=datetime_target + datetime.timedelta(hours=1),
            template=template_path,
            match_first=True)
    else:
        ost_health_files = [cache_file]

    if not ost_health_files:
        raise IOError("No OST health files found in %s for %s" % (
            template_path,
            str(datetime_target)))

    # We can get away with the following because NerscLfsOstFullness,
    # NerscLfsOstMap, and NerscLfsOstMap.get_failovers all have the same
    # structure
    if metric == "fullness":
        ost_health = None
        for df_file in ost_health_files:
            parsed = nersc_lfsstate.NerscLfsOstFullness(cache_file=df_file)
            if ost_health is None:
                ost_health = parsed
            else:
                ost_health.update(parsed)
    else:
        # metric == "failures" (validated above)
        ost_map = None
        for map_file in ost_health_files:
            parsed = nersc_lfsstate.NerscLfsOstMap(cache_file=map_file)
            if ost_map is None:
                ost_map = parsed
            else:
                ost_map.update(parsed)
        ost_health = ost_map.get_failovers()

    timestamps = sorted(int(x) for x in ost_health)

    # Unoptimized walk through to find our timestamp of interest
    target_timestamp = int(time.mktime(datetime_target.timetuple()))

    # If the day's records start after the target time stamp, just report the
    # first record (target_index=0)
    target_index = 0
    for index, timestamp in enumerate(timestamps):
        if timestamp >= target_timestamp:
            if index > 0:
                target_index = index - 1
            break

    fs_data = ost_health[timestamps[target_index]][file_system]
    if metric == "fullness":
        results = _summarize_fullness(fs_data)
    else:
        results = _summarize_failover(fs_data)

    # In case you want to interpolate--hope is that we have enough data points
    # where OST volumes will not change significantly enough to require
    # interpolation
    if target_index < (len(timestamps) - 1):
        # BUGFIX: report the NEXT sample's timestamp; the original code
        # returned timestamps[target_index] + 1 (current timestamp plus one
        # second), which is useless for interpolation
        results['ost_next_timestamp'] = timestamps[target_index + 1]
    results.update({
        'ost_actual_timestamp': timestamps[target_index],
        'ost_requested_timestamp': target_timestamp,
    })
    return results
def get_fullness_hdf5(file_system, datetime_target):
    """Get file system fullness from an HDF5 object.

    Given a file system name (e.g., snx11168) and a datetime object,
    return summary statistics about the OST fullness.

    Args:
        file_system (str): Name of file system whose data should be retrieved
        datetime_target (datetime.datetime): Time at which requested data
            should be retrieved

    Returns:
        dict: various statistics about the file system fullness, or an empty
        dict if no fullness data exists for the requested time range

    Raises:
        ValueError: if an OST name is encountered which does not conform to
            a naming convention from which an OST index can be derived
    """
    # For concordance with the lfsstate version, assume a generous lookbehind
    # that captures at least a few timesteps. Lookahead does not need to be as
    # generous.
    datetime_start = datetime_target - datetime.timedelta(hours=1)
    datetime_end = datetime_target + datetime.timedelta(hours=1)
    df_bytes = tokio.tools.hdf5.get_dataframe_from_time_range(
        fsname=file_system,
        dataset_name='/fullness/bytes',
        datetime_start=datetime_start,
        datetime_end=datetime_end)
    df_bytes_tot = tokio.tools.hdf5.get_dataframe_from_time_range(
        fsname=file_system,
        dataset_name='/fullness/bytestotal',
        datetime_start=datetime_start,
        datetime_end=datetime_end)

    # Bail if nothing is found
    if df_bytes is None or df_bytes_tot is None or len(df_bytes) == 0 or len(df_bytes_tot) == 0:
        return {}

    # NAs must be filled or else fullness/total values will be spuriously low
    df_bytes = df_bytes.fillna(method='ffill').fillna(method='bfill')
    df_bytes_tot = df_bytes_tot.fillna(method='ffill').fillna(method='bfill')

    # Find closest matching timestamp
    iloc = df_bytes.index.get_loc(datetime_target, method='nearest')

    # Build a dictionary that _summarize_fullness will accept as input
    results = {}
    for ostname, value in df_bytes.iloc[iloc].items():
        # column names presumably look like "<fs>-OST0000"; keep the last
        # dash-delimited token as the OST key
        ostname_key = ostname.split('-')[-1]
        try:
            # decode the hex OST index from a name like OST001a
            target_index = int(ostname_key.lower().lstrip('ost'), 16)
        except ValueError as error:
            raise type(error)("Cannot derive OST index from name '%s'" % ostname)
        used_kib = int(value / 1024.0)
        total_kib = int(df_bytes_tot.iloc[iloc][ostname] / 1024.0)
        results[ostname_key] = {
            'used_kib': used_kib,
            'total_kib': total_kib,
            'target_index': target_index,
        }

    summarized_data = _summarize_fullness(results)

    # for interpolation and error bounding
    # BUGFIX: guard must be iloc + 1 < len(...); the original `iloc < len(...)`
    # is always true and raised IndexError whenever the nearest match was the
    # last row
    if iloc + 1 < len(df_bytes.index):
        summarized_data['ost_next_timestamp'] = df_bytes.index[iloc + 1].to_pydatetime()
    summarized_data.update({
        'ost_actual_timestamp': df_bytes.index[iloc].to_pydatetime(),
        'ost_requested_timestamp': datetime_target,
    })
    return summarized_data
[docs]def _summarize_fullness(fs_data):
"""Summarize fullness data for a single time record
Given an fs_data dict, generate a dict of summary statistics. Expects
fs_data dict of form generated by `nersc_lfsstate.NerscLfsOstFullness`::
{
"MDT0000": {
"mount_pt": "/scratch1",
"remaining_kib": 2147035984,
"target_index": 0,
"total_kib": 2255453580,
"used_kib": 74137712
},
"OST0000": {
"mount_pt": "/scratch1",
"remaining_kib": 28898576320,
"target_index": 0,
"total_kib": 90767651352,
"used_kib": 60894630700
},
...
}
Args:
fs_data (dict): a single timestamp and file system record taken from
a nersc_lfsstate.NerscLfsOstFullness object
Returns:
dict: summary metrics about the state of the file system fullness
"""
results = {
'ost_least_full_kib': None,
'ost_most_full_kib': 0,
'ost_avg_full_kib': 0,
'ost_avg_full_pct': 0,
'ost_count': 0,
}
for ost_name, ost_data in fs_data.items():
# Only care about OSTs, not MDTs or MGTs
if not ost_name.lower().startswith('ost'):
continue
results['ost_count'] += 1
results['ost_avg_full_kib'] += ost_data['used_kib']
results['ost_avg_full_pct'] += ost_data['total_kib']
if results['ost_least_full_kib'] is None \
or results['ost_least_full_kib'] > ost_data['used_kib']:
results['ost_least_full_kib'] = ost_data['used_kib']
results['ost_least_full_name'] = ost_name
results['ost_least_full_pct'] = 100.0 * ost_data['used_kib'] / ost_data['total_kib']
if results['ost_most_full_kib'] < ost_data['used_kib']:
results['ost_most_full_kib'] = ost_data['used_kib']
results['ost_most_full_name'] = ost_name
results['ost_most_full_pct'] = 100.0 * ost_data['used_kib'] / ost_data['total_kib']
# If there are no osts, this will break
try:
results['ost_avg_full_kib'] = int(float(results['ost_avg_full_kib']) \
/ float(results['ost_count']))
results['ost_avg_full_pct'] = 100.0 * float(results['ost_avg_full_kib']) \
/ float(results['ost_avg_full_pct']) \
* float(results['ost_count'])
except ZeroDivisionError:
pass
results['ost_least_full_id'] = fs_data[results['ost_least_full_name']]['target_index']
results['ost_most_full_id'] = fs_data[results['ost_most_full_name']]['target_index']
return results
[docs]def _summarize_failover(fs_data):
"""Summarize failover data for a single time record
Given an fs_data dict, generate a dict of summary statistics. Expects
fs_data dict of the form generated by parse_lustre_txt.get_failovers::
{
"abnormal_ips": {
"10.100.104.140": [
"OST0087",
"OST0086",
...
],
"10.100.104.43": [
"OST0025",
"OST0024",
...
]
},
"mode": 1
}
Args:
fs_data (dict): a single timestamp and file system record taken from
the output of `nersc_lfsstate.NerscLfsOstMap.get_failovers`
Returns:
dict: summary metrics about the state of failovers on the file system
"""
num_abnormal_ip = len(fs_data['abnormal_ips'])
num_abnormal_osts = 0
if num_abnormal_ip:
for _, ost_list in fs_data['abnormal_ips'].items():
num_abnormal_osts += len(ost_list)
avg_overload = float(num_abnormal_osts) / float(num_abnormal_ip)
avg_overload_factor = avg_overload / float(fs_data['mode'])
else:
avg_overload = 0.0
avg_overload_factor = 1.0
return {
'ost_overloaded_oss_count': num_abnormal_ip,
'ost_overloaded_ost_count': num_abnormal_osts,
'ost_avg_overloaded_ost_per_oss': avg_overload,
'ost_avg_overloaded_overload_factor': avg_overload_factor,
}