Source code for sensortoolkit.calculate._regression_stats

# -*- coding: utf-8 -*-
"""
This module computes statistical metrics for the Ordinary Least Squares linear
regression between sensor and FRM/FEM datasets (FRM/FEM as the dependent
variable along the x-axis and sensor data as the independent variable along
the y-axis).

Bias and linearity
------------------

U.S. EPA's Performance Targets Reports recommend using a linear regression
model, relating sensor and FRM/FEM measurements, to determine the magnitude
of bias and linearity. The regression model takes the form

.. math::

    y = mx + b

where

    :math:`y` = sensor measurements

    :math:`x` = FRM/FEM measurements

    :math:`m` = regression slope

    :math:`y` = regression intercept


The slope :math:`m` and intercept :math:`y` indicate the degree of systematic
bias between sensor and reference measurements.

Linearity is measured via the coefficient of determination (:math:`R^2`).

===============================================================================

@Author:
  | Samuel Frederick, NSSC Contractor (ORAU)
  | U.S. EPA / ORD / CEMM / AMCD / SFSB

Created:
  Tue Mar 3 13:47:32 2020
Last Updated:
  Tue Jul 13 11:09:13 2021
"""
import os
import sys
import numpy as np
import pandas as pd
from sensortoolkit.datetime_utils import get_todays_date
from sensortoolkit.lib_utils import check_type


[docs]def regression_stats(sensor_df_obj, ref_df_obj, deploy_dict, param, serials,
                     verbose=True):
    """Compute OLS regression statistics.

    Module is used to compute the following regressions:

    - Sensor vs. FRM/FEM
    - Sensor vs. Inter-sensor average

    For each instance, the dependent and independent variables are assigned as
    **hourly/daily sensor data vs. hourly/daily reference data**; please note
    the ``ref_df_obj`` object can be either a DataFrame containing FRM/FEM
    concentrations, or a DataFrame containing intersensor averages depending on
    the use case. The 'ref' label refers more so to the fact that the dataset is
    used as the independent variable for regressions.

    .. note::

      The DataFrames within the ``sensor_df_obj`` and ``ref_df_obj`` arguments
      should contain data reported at the same sampling frequency (e.g., if a
      sensor DataFrame containing data at 1-hour averaged intervals is passed
      to the ``sensor_df_obj``, the reference DataFrame passed to
      ``ref_df_obj`` must also contain data at 1-hour averaged intervals).

    Args:
        sensor_df_obj (pandas DataFrame or list of pandas DataFrames):
            Either a DataFrame or list of DataFrames containg sensor parameter
            measurements. Data corresponding to passed parameter name are used
            as the dependent variable.
        ref_df_obj (pandas DataFrame):
            Reference DataFrame (either FRM/FEM OR Inter-sensor averages). Data
            corresponding to passed parameter name are used as the independent
            variable.
        deploy_dict (dict):
            A dictionary containing descriptive statistics and
            textual information about the deployment (testing agency, site,
            time period, etc.), sensors tested, and site conditions during the
            evaluation.
        param (str):
            Parameter name for which to compute regression statistics.
        serials (dict):
            A dictionary of sensor serial identifiers for each unit
            in a testing group.
        verbose (bool):
            If true, print statements will be displayed in the console output.
            Defaults to True.

    Returns:
        stats_df (pandas DataFrame):
            Statistics DataFrame for either sensor vs. FRM/FEM or sensor vs.
            intersensor mean OLS regression.

    """

    stats_cols = ['Sensor Name', 'Sensor_Number', 'Sensor_Serial',
                  'Averaging Interval', 'Param', 'Reference', 'R$^2$',
                  'Slope', 'Intercept', 'Sensor RMSE', 'N', 'Sensor_Min',
                  'Sensor_Max', 'Sensor_Mean']

    stats_df = pd.DataFrame(columns=stats_cols)

    sensor_name = deploy_dict['Sensor Name']
    sensor_nums = list(serials.keys())

    sensor_dtype = check_type(sensor_df_obj,
                             accept_types=[pd.core.frame.DataFrame, list])
    check_type(ref_df_obj, accept_types=[pd.core.frame.DataFrame])

    if sensor_dtype is pd.core.frame.DataFrame:
        df_list = [sensor_df_obj]
    if sensor_dtype is list:
        df_list = sensor_df_obj

    # Loop over dataframes for sensors, compute regression stats
    for i, (df, sensor_num) in enumerate(zip(df_list, sensor_nums)):

        ydata = df[param + '_Value']
        if 'mean_' + param + '_Value' in ref_df_obj:
            xdata = ref_df_obj['mean_' + param + '_Value']
            ref_name = 'Intersensor_mean_' + param
        else:
            xdata = ref_df_obj[param + '_Value']
            ref_name = ''.join(
                        list(ref_df_obj[param + '_Method'].dropna().apply(
                                        lambda x: str(x)).unique()[0]))

        # Locate the union of non-nan sensor (y) and reference (x) data
        # False if either is nan and true if both are finite
        # idx = np.isfinite(xdata) & np.isfinite(ydata)

        # X = pd.Series(xdata)
        # Y = pd.Series(ydata)

        combine_df = pd.DataFrame()
        combine_df['ref_data'] = xdata
        combine_df['sensor_data'] = ydata
        combine_df = combine_df.dropna()


        # Determine the averaging interval for the supplied dataset
        if (df.index[1] - df.index[0]) == pd.Timedelta('1 days'):
            avg_int = '24-hour'
        if (df.index[1] - df.index[0]) == pd.Timedelta('1 hour'):
            avg_int = '1-hour'

        if i == 0 and verbose:
            print('Computing {0} regression statistics for {1} vs.'
                  ' {2}'.format(avg_int, sensor_name, ref_name))

        if verbose:
            print('..{0}'.format(list(serials.values())[i]))

        if combine_df.empty:
            print('..Warning: Linear regression not possible for sensor '
                  + str(sensor_num) + '. Sensor or reference data are '
                  'null.')
            r_square = slope = intercept = rmse = N = np.nan
            param_min = param_max = param_mean = np.nan
        else:
            N = int(len(combine_df))

            if N <= 2:
                print('..Warning: Linear regression not possible for' +
                      ' sensor ' + str(sensor_num) +
                      '. Insufficient number of data points')
                r_square = slope = intercept = rmse = N = np.nan
                param_min = param_max = param_mean = np.nan

            else:
                # Compute linear regress. for the union of finite x and y data
                fit = np.polyfit(combine_df.ref_data, combine_df.sensor_data, 1)
                slope = fit[0]
                intercept = fit[1]
                pearson_coeff = combine_df.ref_data.corr(combine_df.sensor_data, method='pearson')
                r_square = pearson_coeff*pearson_coeff

                sensor_val = combine_df.sensor_data
                ref_val = combine_df.ref_data
                MSE = (1/N)*sum((sensor_val-ref_val)**2)
                rmse = np.sqrt(MSE)

                param_min = combine_df.sensor_data.min()
                param_max = combine_df.sensor_data.max()
                param_mean = combine_df.sensor_data.mean()

        serial_id = serials[str(sensor_num)]

        row_df = pd.DataFrame([sensor_name,
                               sensor_num,
                               serial_id,
                               avg_int,
                               param,
                               ref_name,
                               r_square,
                               slope,
                               intercept,
                               rmse,
                               N,
                               param_min,
                               param_max,
                               param_mean])

        row_df.index = stats_df.columns
        row_df = row_df.T

        stats_df = pd.concat([stats_df, row_df]).reset_index(drop=True)

    # Add metric averages if multiple sensor datasets passed to function
    if sensor_dtype is list:
        metric_list = ['R$^2$', 'Slope', 'Intercept', 'Sensor RMSE',
                       'N', 'Sensor_Min', 'Sensor_Max', 'Sensor_Mean']
        avg_data = stats_df.where(stats_df['Averaging Interval'] == avg_int
                                  ).dropna(how='all', axis=0)

        avg_df = pd.DataFrame(np.nan, columns=stats_cols, index=[0])
        avg_df['Reference'] = 'Metric Average:'

        for col_i, metric in enumerate(metric_list, 6):
            metric_data = avg_data.iloc[:, col_i]
            metric_mean = metric_data.mean()
            avg_df.iloc[0, col_i] = metric_mean

        stats_df = pd.concat([stats_df, avg_df]).reset_index(drop=True)

        stats_df[stats_df.columns[6:]] = stats_df[stats_df.columns[6:]
                                                  ].astype('float64')

    return stats_df


[docs]def join_stats(hourly_stats, daily_stats, write_to_file=False, stats_path=None,
               stats_type='individual'):
    """Combine 1-hour and 24-hour regression statistics DataFrames.

    Args:
        hourly_stats (pandas DataFrame):
            DataFrame containing 1-hour averaged
            sensor vs. reference regression statistics, returned by call to
            regression_stats().
        daily_stats (pandas DataFrame):
            DataFrame containing 24-hour averaged
            sensor vs. reference regression statistics, returned by call to
            regression_stats().
        write_to_file (bool, optional):
            If True, the merged statistics dataframe will be written to a csv
            file. Defaults to False.
        stats_path (str, optional):
            The full path to the statistics directory where the dataset will be
            saved if ``write_to_file`` is True. Defaults to None.
        stats_type (str, optional):
            The type of regression statistics dataframes that are being joined
            ('individual' indicates individual sensor vs. reference
            regression, 'average' indicates sensor vs. intersensor average
            regression statistics)

    Returns:
        stats_df (pandas DataFrame):
            DataFrame containing both 1-hour and 24-hour averaged statistics.

    """
    if stats_type not in ['average', 'individual']:
        raise ValueError(f'Invalid value {stats_type} passed to stats_type. '
                         'Choose from either "individual" or "average"')

    if stats_type == 'individual':
        stats_type = 'stats_df'
    if stats_type == 'average':
        stats_type = 'avg_stats_df'

    stats_df = hourly_stats.append(daily_stats)

    ref_name = stats_df.Reference[stats_df.Reference != 'Metric Average:'
                                  ].value_counts().keys()[0]
    sensor_name = stats_df['Sensor Name'].value_counts().keys()[0]
    param = stats_df['Param'].value_counts().keys()[0]

    # Save the statistics DataFrame
    if write_to_file is True:
        if stats_path is None:
            sys.exit('No directory path specified for saving DataFrame')

        today = get_todays_date()
        ref_name = ref_name.replace(' ', '_')

        filename = '_'.join([sensor_name, param, 'vs', ref_name, stats_type,
                             today])
        filename += '.csv'

        dataframe_to_csv(stats_df,
                         parent_path=stats_path,
                         filename=filename)

    return stats_df


[docs]def dataframe_to_csv(obj, parent_path, filename, **kwargs):
    """Save a pandas DataFrame to a comma-separated value file.

    Args:
        obj (pandas DataFrame): The DataFrame to write to a .csv file.
        parent_path (str): The path to the folder where the file will be saved.
        filename (str): The name of the resulting .csv file.
        **kwargs (dict): Keyword arguments passed to the pandas ``to_csv()``
            method.

    Returns:
        None.

    """
    check_type(obj, accept_types=[pd.core.frame.DataFrame])
    filepath = os.path.join(parent_path, filename)
    print('..Saving dataset to the following path: {0}'.format(filepath))
    obj.to_csv(filepath, index=kwargs.get('index', False))