Source code for energy_balance.scripts.download_data_by_date

#!/usr/bin/env python

__author__ = 'Elle Smith'
__date__ = '23 Jul 2021'
__contact__ = 'eleanor.smith@stfc.ac.uk'


import argparse
import os
import pandas as pd
from datetime import datetime, timedelta
from pycampbellcr1000 import CR1000
from energy_balance import CONFIG
import time
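
# The CONFIG object is expected to provide the keys used below. The example
# values are an illustrative assumption, not taken from this package's
# configuration:
#
#   CONFIG['common']['logger_url']       e.g. 'tcp:192.168.1.110:6785'
#   CONFIG['common']['logger_csv_path']  e.g. '~/logger_csvs'
#   CONFIG['common']['logger_tables']    e.g. ['Housekeeping', 'GPS_datetime',
#                                              'SoilTemperature', 'SoilMoisture',
#                                              'SoilHeatFlux', 'Radiation']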


def arg_parse():
    parser = argparse.ArgumentParser()

    parser.add_argument('-s', '--start-date',
                        type=str,
                        required=True,
                        help="The start date to extract data for, in the format YYYY-MM-DD.")

    parser.add_argument('-e', '--end-date',
                        type=str,
                        required=False,
                        help="The end date to extract data for, in the format YYYY-MM-DD. "
                             "Defaults to the start date, so data is downloaded for one day only.")
                        
    return parser.parse_args()


def get_data(url, start_date, end_date, dir_path):
    """
    Extract data from the Campbell data logger for each specified table and
    save it to a daily csv file for each day between the specified dates.
    Default tables are: Housekeeping, GPS_datetime, SoilTemperature,
    SoilMoisture, SoilHeatFlux and Radiation.

    :param url: (str) URL for connection with logger in format 'tcp:iphost:port' or 'serial:/dev/ttyUSB0:19200:8N1'
    :param start_date: (datetime.datetime) The start date from which to collect data.
    :param end_date: (datetime.datetime) The end date after which to stop collecting data. (The end date will be included in the data.)
    :param dir_path: (str) The path to the top level directory in which to create the csv files and folders.

    :returns: None
    """
    device = CR1000.from_url(url)
    # device.list_tables():
    # ['Status', 'Housekeeping', 'GPS_datetime', 'SoilTemperature', 'SoilMoisture', 'SoilHeatFlux', 'Radiation', 'DataTableInfo', 'Public']
    tables = CONFIG['common']['logger_tables']

    while start_date <= end_date:
        for table in tables:
            # last timestamp of the day being processed
            end_of_day = start_date + timedelta(hours=23, minutes=59, seconds=59, microseconds=59)

            csv_dirs = os.path.join(dir_path, table)
            csv_name = f"{table}_{start_date.strftime('%Y-%m-%d')}.csv"
            csv_path = os.path.join(csv_dirs, csv_name)

            # create the directory for this table's csv files
            if not os.path.exists(csv_dirs):
                os.makedirs(csv_dirs)

            # if the file doesn't exist - make it
            if not os.path.isfile(csv_path):
                open(csv_path, 'w').close()

            # open the csv file
            try:
                df = pd.read_csv(csv_path)
            except pd.errors.EmptyDataError:
                get_data_from_range(device, table, csv_path, start_date, end_of_day, header=True)
                continue

            if df.empty:
                get_data_from_range(device, table, csv_path, start_date, end_of_day, header=False)
            else:
                # if the file already has data in it, update the existing data from the latest entry onwards
                # get the latest date before updating - add a microsecond so the last record is not duplicated
                latest = datetime.strptime(df['Datetime'].iloc[-1], "%Y-%m-%d %H:%M:%S") + timedelta(microseconds=1)
                get_data_from_range(device, table, csv_path, latest, end_of_day, header=False)

            print(f"Completed for {table} for {start_date.strftime('%Y-%m-%d')}")
            time.sleep(3)

        start_date = start_date + timedelta(1)
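
# Example usage of get_data (a minimal sketch; the URL and output directory
# are illustrative assumptions):
#
#   from datetime import datetime
#   get_data('tcp:192.168.1.110:6785',
#            datetime(2021, 7, 1), datetime(2021, 7, 2),
#            '/tmp/logger_csvs')
#
# This would write one csv per table per day, e.g.
# /tmp/logger_csvs/Radiation/Radiation_2021-07-01.csv and
# /tmp/logger_csvs/Radiation/Radiation_2021-07-02.csv.
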
def get_data_from_range(device, table, csv_path, start, end, header):
    """
    Gets the range of data specified by the start and end dates and appends it
    to the csv file at the path specified.

    :param device: (pycampbellcr1000.CR1000) The device object for the connection with the logger.
    :param table: (str) The name of the table on the logger from which the data is being extracted.
    :param csv_path: (str) The path to the csv file to back fill with the data.
    :param start: (datetime.datetime) The start datetime from which to collect data.
    :param end: (datetime.datetime) The end datetime after which to stop collecting data. (end will be included in the data.)
    :param header: (bool) Whether to write the column header line before the data.

    :returns: None
    """
    data = device.get_data(table, start, end)
    content = data.to_csv(header=header)

    with open(csv_path, "a") as f:
        f.write(content)
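
# Example usage of get_data_from_range (a sketch, assuming a reachable logger
# with a 'Radiation' table; the URL and csv path are illustrative):
#
#   from datetime import datetime
#   device = CR1000.from_url('serial:/dev/ttyUSB0:19200:8N1')
#   get_data_from_range(device, 'Radiation', '/tmp/Radiation_2021-07-01.csv',
#                       datetime(2021, 7, 1),
#                       datetime(2021, 7, 1, 23, 59, 59),
#                       header=True)
#
# header=True is only wanted on the first write to an empty file; later
# appends to the same file pass header=False so the column names are not
# repeated mid-file.
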
def main():
    args = arg_parse()

    start_date = datetime.strptime(args.start_date, "%Y-%m-%d")

    # if no end date, make it the same as the start date, so data is downloaded for one day only
    if args.end_date:
        end_date = datetime.strptime(args.end_date, "%Y-%m-%d")
        complete_stmnt = f'Data downloaded for range {args.start_date} {args.end_date}'
    else:
        end_date = start_date
        complete_stmnt = f'Data downloaded for {args.start_date}'

    url = CONFIG['common']['logger_url']
    dir_path = os.path.expanduser(CONFIG['common']['logger_csv_path'])

    get_data(url, start_date, end_date, dir_path)
    print(complete_stmnt)


if __name__ == '__main__':
    main()
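
# Command-line usage (run as a module; the dates are illustrative):
#
#   $ python -m energy_balance.scripts.download_data_by_date -s 2021-07-01 -e 2021-07-03
#   $ python -m energy_balance.scripts.download_data_by_date -s 2021-07-01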