Source code for energy_balance.netcdf.base_netcdf

__author__ = 'Elle Smith'
__date__ = '09 Aug 2021'
__contact__ = 'eleanor.smith@stfc.ac.uk'

import pandas as pd
from netCDF4 import Dataset
from datetime import datetime
import numpy as np
import os

from energy_balance import CONFIG

class BaseNetCDF:
    """
    Base class used for creating netCDF files.

    Creates all the common variables found in netCDF files under the
    NCAS-GENERAL Data Standard. Sets all the required global attributes.
    Constant values are taken from the config file, excluding 'headers' and
    'data_product' which must be set in each specific implementation.

    Instantiating the class writes the file: the constructor builds the masked
    dataframe, opens the output dataset, populates it and closes it again.

    :param df: A pandas dataframe containing all columns required to create
               the netCDF file.
    :param qc: A pandas dataframe with the same columns as df, but containing
               the quality control values instead. (i.e. 1, 2, 3 etc.)
    :param date: (datetime.datetime) The date to create the netCDF file for.
                 If frequency is monthly, only the year and month will be
                 taken into account.
    :param frequency: (str) 'daily' or 'monthly'. Determines whether the file
                      will use data from one day or for one month.
    """

    # Name of the datetime column in ``df``.
    dt_header = CONFIG['common']['datetime_header']
    # Placeholders: each concrete implementation must override these two.
    headers = 'UNDEFINED'
    data_product = "UNDEFINED"
    # Highest qc flag value that still counts as usable data.
    qc_flag_level = CONFIG['common']['qc_flag_level']
    # Value written to the file in place of missing (NaN) data.
    fill_value = CONFIG['common']['fill_value']

    def __init__(self, df, qc, date, frequency):
        self.df = df
        self.qc = qc
        self.get_masked_data(self.qc_flag_level)

        # Validate the frequency and build the date part of the file name.
        date = self.convert_date_to_string(date, frequency)

        output_file_name = f"ncas-energy-balance-1_{CONFIG['global']['platform']}_{date}_{self.data_product}_v{CONFIG['global']['product_version']}.nc"
        output_path = CONFIG['common']['netcdf_path']
        output_file = os.path.expanduser(os.path.join(output_path, output_file_name))

        self.dataset = Dataset(output_file, "w", format='NETCDF4_CLASSIC')
        try:
            self.create_netcdf()
        finally:
            # Always release the file handle, even if population fails
            # part-way through; the exception still propagates.
            self.dataset.close()
        print(f"Dataset created at {output_file}")

    def convert_date_to_string(self, date, frequency):
        """
        Generate a date string for the file name based on the date provided
        and the frequency required.

        :param date: (datetime.datetime) The date to convert to string.
        :param frequency: (str) The frequency at which to have the date
                          string: 'daily' gives YYYYmmdd, 'monthly' gives
                          YYYYmm.
        :returns: (str) The date now converted to string format.
        :raises ValueError: If frequency is neither 'daily' nor 'monthly'.
        """
        if frequency == "monthly":
            return date.strftime("%Y%m")
        if frequency == "daily":
            return date.strftime("%Y%m%d")

        # Report the argument itself: no 'frequency' attribute is ever set on
        # the instance, so reading it here would raise AttributeError instead.
        raise ValueError(f'Frequency {frequency} is not supported. Options are daily or monthly.')

    @staticmethod
    def convert_times(times):
        """
        Convert times from strings to total seconds since
        1970-01-01T00:00:00.

        :param times: (sequence) Times, as strings in the format
                      Y-m-d H:M:S, to convert.
        :returns: (list) The times converted to total seconds (float) since
                  1970-01-01T00:00:00.
        """
        ref_time = datetime.strptime("1970-01-01T00:00:00", "%Y-%m-%dT%H:%M:%S")
        return [
            (datetime.strptime(t, "%Y-%m-%d %H:%M:%S") - ref_time).total_seconds()
            for t in times
        ]

    @staticmethod
    def times_as_datetimes(times):
        """
        Convert times from strings to datetimes.

        :param times: (sequence) Times to convert to datetimes in format
                      Y-m-d H:M:S.
        :returns: (list) The times converted to datetimes.
        """
        return [datetime.strptime(t, "%Y-%m-%d %H:%M:%S") for t in times]

    def get_masked_data(self, mask_value):
        """
        Create masked pandas dataframe based on self.qc and the qc flag
        requested. Sets self.mask and self.df_masked.

        :param mask_value: (int) Max value of qc to show i.e. 1 will show only
                           'good data', 2 will show good data and data marked
                           with a flag of 2.
        """
        self.mask = (self.qc <= mask_value)

        self.df_masked = pd.DataFrame(columns=self.headers)
        self.df_masked[self.dt_header] = self.df[self.dt_header]

        for col in self.headers:
            # The qc dataframe names each flag column '<header>_qc'.
            mask_column = self.mask[col + '_qc']
            # Only rows passing the qc test are kept; on assignment the
            # remaining rows are index-aligned and come out as missing values.
            self.df_masked[col] = self.df[col][mask_column]

    def create_time_variable(self):
        """
        Create the common time variable.

        Also sets self.times, which create_netcdf and set_global_attributes
        read afterwards.
        """
        time_units = "seconds since 1970-01-01 00:00:00"

        time_var = self.dataset.createVariable("time", np.float64, ("time",))
        self.times = self.df[self.dt_header]

        time_values = np.asarray(self.convert_times(self.times), dtype=np.float64)
        time_var[:] = time_values

        time_var.units = time_units
        time_var.standard_name = "time"
        time_var.calendar = "standard"
        time_var.axis = 'T'
        time_var.long_name = "Time (seconds since 1970-01-01 00:00:00)"
        # Take the range from the values just written rather than reading the
        # variable back from the file twice.
        time_var.valid_min = time_values.min()
        time_var.valid_max = time_values.max()

    def create_lon_variable(self):
        """
        Create the common longitude variable, using the longitude value from
        the config file.
        """
        lon_var = self.dataset.createVariable("longitude", np.float32, ("longitude",))
        lon_var[:] = CONFIG['common']['longitude_value']
        lon_var.units = "degrees_east"
        lon_var.standard_name = "longitude"
        lon_var.long_name = "Longitude"

    def create_lat_variable(self):
        """
        Create the common latitude variable, using the latitude value from
        the config file.
        """
        lat_var = self.dataset.createVariable("latitude", np.float32, ("latitude",))
        lat_var[:] = CONFIG['common']['latitude_value']
        lat_var.units = "degrees_north"
        lat_var.standard_name = "latitude"
        lat_var.long_name = "Latitude"

    def create_variable(self, name, data_type, dims, header, **kwargs):
        """
        Generic method to create a variable in the netCDF4 dataset.

        Missing values in the source column are replaced in self.df with the
        fill value before being written. valid_min and valid_max are taken
        from the qc-masked data (self.df_masked).

        :param name: (str) The name of the variable to be created.
        :param data_type: The data type of the variable to be created
                          e.g. numpy.float32
        :param dims: (tuple) The dimensions of the variable to be created
                     e.g. ('time', ) or ('time', 'index')
        :param header: (str) The name of the column in the pandas dataframe to
                       use to populate the data of this variable.
        :param kwargs: (dict) Dictionary of attributes
                       {'attr_name': 'attr_value'} to set on the variable
                       e.g. {'standard_name': 'soil_temperature'}
        """
        # Create variable
        var = self.dataset.createVariable(name, data_type, dims, fill_value=self.fill_value)

        # Convert any nan values to fill values. A single .loc assignment is
        # used because chained indexing (df[col][mask] = x) may write to a
        # temporary copy and leave the dataframe unchanged.
        self.df.loc[self.df[header].isna(), header] = self.fill_value
        var[:] = self.df[header]

        # mask the data according to the qc
        var_masked = self.df_masked[header].astype(data_type)

        # Set variable attributes; the valid range ignores missing values.
        var.valid_min = np.nanmin(var_masked)
        var.valid_max = np.nanmax(var_masked)

        for k, v in kwargs.items():
            setattr(var, k, v)

    def create_qc_variable(self, name, header, dimensions, **kwargs):
        """
        Generic method to create a qc variable on the dataset.

        :param name: (str) The name of the variable to be created.
        :param header: (str) The name of the data column; the flags are read
                       from the '<header>_qc' column of the qc dataframe.
        :param dimensions: (tuple) The dimensions of the variable to be
                           created e.g. ('time', ) or ('time', 'index')
        :param kwargs: (dict) Dictionary of attributes
                       {'attr_name': 'attr_value'} to set on the variable
                       e.g. {'standard_name': 'soil_temperature'}
        """
        var = self.dataset.createVariable(name, np.byte, dimensions)
        qc_header = header + '_qc'
        var[:] = self.qc[qc_header]
        var.units = "1"

        for k, v in kwargs.items():
            setattr(var, k, v)

    def set_global_attributes(self):
        """
        Sets the global attributes in the dataset based on those listed in the
        config file, plus the revision date and the time coverage.

        Reads self.times, so create_time_variable must have run first.
        """
        for k, v in CONFIG['global'].items():
            setattr(self.dataset, k, v)

        time_format = "%Y-%m-%d %H:%M:%S"

        # NOTE: datetime.utcnow() is deprecated from Python 3.12. It is kept
        # here so the timestamp string written to the file does not change.
        self.dataset.last_revised_date = datetime.utcnow().isoformat()
        self.dataset.time_coverage_start = datetime.strptime(self.times.iloc[0], time_format).isoformat()
        self.dataset.time_coverage_end = datetime.strptime(self.times.iloc[-1], time_format).isoformat()

    def create_specific_dimensions(self):
        """
        Class specific implementation to create dimensions specific to that
        data product.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def create_specific_variables(self):
        """
        Class specific implementation to create variables specific to that
        data product, including any qc variables.

        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError

    def create_netcdf(self):
        """
        Method to create the netCDF dataset.

        Creates the common dimensions and variables, then the time-derived
        variables, then the product specific dimensions and variables, and
        finally the global attributes.
        """
        # Create the time dimension - with unlimited length
        self.dataset.createDimension("time", None)
        # Create the latitude dimension - with length 1 as stationary
        self.dataset.createDimension("latitude", 1)
        # Create the longitude dimension - with length 1 as stationary
        self.dataset.createDimension("longitude", 1)

        # create basic variables
        self.create_time_variable()
        self.create_lat_variable()
        self.create_lon_variable()

        # create other variables e.g. day of year
        datetimes = self.times_as_datetimes(self.times)

        # (variable name, data type, value extracted per timestamp, long name)
        time_parts = (
            ("day_of_year", np.float32, lambda t: t.timetuple().tm_yday, "Day of Year"),
            ("year", np.int32, lambda t: t.year, "Year"),
            ("month", np.int32, lambda t: t.month, "Month"),
            ("day", np.int32, lambda t: t.day, "Day"),
            ("hour", np.int32, lambda t: t.hour, "Hour"),
            ("minute", np.int32, lambda t: t.minute, "Minute"),
            ("second", np.float32, lambda t: t.second, "Second"),
        )

        # NOTE(review): create_time_related_variable is not defined in the
        # part of the class visible here - presumably it is provided
        # elsewhere; TODO confirm, otherwise this call raises AttributeError.
        for var_name, data_type, extract, long_name in time_parts:
            values = [extract(t) for t in datetimes]
            self.create_time_related_variable(var_name, data_type, values, long_name)

        # create specific dimensions and variables
        self.create_specific_dimensions()
        self.create_specific_variables()

        # set global attributes
        self.set_global_attributes()