Source code for energy_balance.netcdf.quality_control

# Module metadata: author, date and contact address.
__author__ = 'Elle Smith'
__date__ = '09 Aug 2021'
__contact__ = 'eleanor.smith@stfc.ac.uk'

import pandas as pd
import numpy as np
import os
from datetime import datetime
from energy_balance import CONFIG

# make this more general

class QualityControl:
    """
    Base class used to apply quality control to data in pandas data frames.

    Creates a quality control dataframe and a masked dataframe (the initial
    data with a quality control mask applied) from input csv files. The input
    files and various options are taken from a config file.

    Constant values are taken from the config file, excluding 'headers' which
    must be set in each specific implementation.

    Instantiating the class runs the full QC pipeline immediately (see
    execute_qc), so subclasses must implement create_dataframes and
    qc_variables before they can be constructed.

    :param date: (datetime.datetime) The date to do the QC for. If frequency
        is monthly, only the year and month will be taken into account.
    :param frequency: (str) 'daily' or 'monthly'. Determines whether one days
        worth of data, or one months worth is taken from the csv files to
        create the dataframes.
    """

    # Name of the datetime column, shared by all implementations.
    dt_header = CONFIG['common']['datetime_header']
    # Placeholder: each implementation must override this with a list of
    # column names (create_masked_df concatenates it onto a list).
    headers = 'UNDEFINED'
    # Maximum QC flag value kept when building the masked dataframe.
    qc_flag_level = CONFIG['common']['qc_flag_level']

    def __init__(self, date, frequency):
        self.date = date
        self.frequency = frequency
        self.execute_qc()

    def prepare_date(self, input_date_format):
        """
        Prepares the input date format so it matches with the frequency
        requested and formats self.date with it.

        For 'monthly', any '%d' directive is removed from the format, leading
        and trailing '-' or '/' separators are stripped, doubled separators
        are collapsed and a trailing '*' is appended (presumably so callers
        can match every day of the month - confirm against callers). A
        monthly format with no day part is used unchanged.

        For 'daily', the format must contain a day part.

        :param input_date_format: (str) The format in which the date is
            provided in the input csv files.
        :returns: (str) The date now converted to string format.
        :raises ValueError: If frequency is 'daily' but the format has no day
            part, or if frequency is neither 'daily' nor 'monthly'.
        """
        # NOTE: the day check looks for the letter 'd' anywhere in the
        # format, not strictly the '%d' directive.
        has_day = 'd' in input_date_format

        if self.frequency == 'monthly':
            # Previously a monthly request with a day-less format fell
            # through to the "frequency not supported" error below; such a
            # format is already at month resolution, so use it as-is.
            if has_day:
                # remove day part from input date format
                month_format = input_date_format.replace('%d', '').strip('-/')
                month_format = month_format.replace('//', '/').replace('--', '-')
                input_date_format = month_format + '*'
        elif self.frequency == 'daily':
            if not has_day:
                raise ValueError('Input date format does not specify a day, so daily files can not be created.')
        else:
            raise ValueError(f'Frequency {self.frequency} is not supported. Options are daily or monthly.')

        return self.date.strftime(input_date_format)

    def create_dataframes(self):
        """
        Class specific implementation to create pandas dataframe from input
        csv and empty QC dataframe other than column names.

        Sets self._df and self._qc

        :raises NotImplementedError: Always, in this base class.
        """
        # set self._df and self._qc
        raise NotImplementedError

    def apply_qc(self, conditions, choices, col):
        """
        Generic method to apply QC to a column in a dataframe, new column is
        created in QC dataframe.

        The new column is named ``col + '_qc'``. Rows matching none of the
        conditions get a flag of 1.

        :param conditions: (list) The conditions at which a QC flag should be
            applied. e.g. [np.isnan(self._df[col]), self._df[col] < -35,
            self._df[col] > 50]
        :param choices: (list) The QC flag to be applied, corresponds to
            conditions. e.g. [2, 2, 2]
        :param col: (str) The name of the column to apply QC to e.g.
            'WP_kPa_1'
        """
        col_qc = col + '_qc'
        self._qc[col_qc] = np.select(conditions, choices, default=1)

    def qc_variables(self):
        """
        Class specific implementation to apply QC to all columns.

        :raises NotImplementedError: Always, in this base class.
        """
        # make use of apply_qc
        raise NotImplementedError

    def create_masked_df(self, qc_flag):
        """
        Create masked pandas dataframe based on self._qc and the qc flag
        requested. Sets self._df_masked (and self.mask).

        :param qc_flag: (int) Max value of qc to show i.e. 1 will show only
            'good data', 2 will show good data and data marked with a flag
            of 2.
        """
        # Boolean frame: True where the QC flag is within the requested level.
        self.mask = (self._qc <= qc_flag)

        self._df_masked = pd.DataFrame(columns=[self.dt_header] + self.headers)
        # The datetime column is copied across unmasked.
        self._df_masked[self.dt_header] = self._df[self.dt_header]

        for col in self.headers:
            mask_column = self.mask[col + '_qc']
            # Only rows passing the mask are assigned; the remaining rows are
            # left as missing values by index alignment.
            self._df_masked[col] = self._df[col][mask_column]

    def execute_qc(self):
        """
        Create the dataframes, apply the QC and create the masked dataframe.

        The masked dataframe is built using the class level qc_flag_level.
        """
        self.create_dataframes()
        self.qc_variables()
        self.create_masked_df(self.qc_flag_level)

    def create_masked_csv(self, file_path):
        """
        Create a csv file from the masked dataframe (written without the
        index) and record its path in self._masked_csv.

        :param file_path: (str) The path at which to create the csv file e.g.
            /path/to/my/file.csv
        """
        self._df_masked.to_csv(file_path, index=False)
        self._masked_csv = file_path

    @property
    def df(self):
        """
        Returns the original dataframe created from the input csv files.
        All headers set in each class implementation of self.headers are
        included.
        """
        return self._df

    @property
    def df_masked(self):
        """
        Returns the original dataframe masked following QC.
        """
        return self._df_masked

    @property
    def qc(self):
        """
        Returns the QC dataframe created based on conditions and choices set
        in the qc_variables method.
        """
        return self._qc