# lib_era5_land_pixel.py

# -*- coding: UTF-8 -*-
# Python
"""
Functions to call ECMWF Reanalysis with CDS-api

- ERA5-land daily request
- request a list of daily variables dedicated to the calculus of ET0
 and the generation of MODSPA daily forcing files
 
 heavily modified from @rivallandv's original file

@author: auclairj
"""

import os  # for path exploration and file management
from typing import List, Tuple  # to declare variables
import numpy as np  # for math on arrays
import xarray as xr  # to manage nc files
from datetime import datetime  # to manage dates
from p_tqdm import p_map  # for multiprocessing with progress bars
from dateutil.rrule import rrule, MONTHLY
from fnmatch import fnmatch  # for file name matching
import pandas as pd  # to manage dataframes
import rasterio  as rio  # to manage geotiff images
import geopandas as gpd  # to manage shapefile crs projections
from rasterio.mask import mask  # to mask images
from shapely.geometry import box  # to extract parcel statistics
import netCDF4 as nc  # to write netcdf4 files
from tqdm import tqdm  # to follow progress
from multiprocessing import Pool  # to parallelize reprojection
from psutil import virtual_memory  # to check available ram
from modspa_pixel.config.config import config  # to import config file
from modspa_pixel.source.modspa_samir import calculate_time_slices_to_load  # to optimise I/O operations
import re  # for string comparison
import warnings  # to suppress pandas warning

# CDS API external library
# source: https://pypi.org/project/cdsapi/
import cdsapi  # to download cds data
import requests  # to request data

# FAO ET0 calculator external library
# Notes
# source: https://github.com/Evapotranspiration/ETo
# documentation: https://eto.readthedocs.io/en/latest/
import eto  # to calculate ET0


def era5_enclosing_shp_aera(area: List[float], pas: float) -> Tuple[float, float, float, float]:
    """
    Snap a bounding box onto the ERA5 grid so that the returned
    box encloses the requested one.
    system projection: WGS84 lat/lon degree

    Arguments
    =========

    1. area: ``List[float]``
        bounding box of the demanded area
        list of floats: [lat north, lon west, lat south, lon east] in degree WGS84
    2. pas: ``float``
        grid size (same unit as the coordinates)

    Returns
    =======

    1. era5_area: ``Tuple[float, float, float, float]``
        (N, W, S, E) corners of the enclosing grid in decimal degree,
        each rounded to two decimals

    .. note::

        South and west edges are floored to the grid cell that contains them,
        north and east edges are pushed to the upper edge of their cell
        (floor + one cell), even when they already lie on a grid line.

    """

    north, west, south, east = area

    def _cell_to_degrees(cell_index: float) -> float:
        # grid index -> coordinate, rounded to two decimals
        return round(cell_index*pas, 2)

    # Index of the grid cell that contains each edge
    north_cell = north//pas
    west_cell = west//pas
    south_cell = south//pas
    east_cell = east//pas

    # North and east are extended by one cell to enclose the area
    return (
        _cell_to_degrees(north_cell + 1),
        _cell_to_degrees(west_cell),
        _cell_to_degrees(south_cell),
        _cell_to_degrees(east_cell + 1),
    )  # [N,W,S,E]


def call_era5land_daily(args: Tuple[str, str, str, str, List[int], str]) -> None:
    """
    Query of one month of daily ERA5-land data of a selected variable
    according to a selected statistic

    Documentation:
    `cds_climate <https://datastore.copernicus-climate.eu/documents/app-c3s-daily-era5-statistics/C3S_Application-Documentation_ERA5-daily-statistics-v2.pdf>`_

    The download is written to a temporary ``.part`` file which is renamed
    to its final name only once complete. An interrupted or failed download
    therefore never leaves a truncated ``.nc`` file that a later run would
    mistake for an already downloaded month.

    Arguments
    =========

    (packed in args: ``tuple``)

    1. year: ``str``
        year at YYYY format.
    2. month: ``str``
        month at MM format.
    3. variable: ``str``
        user-selectable variable
        cf. Appendix A Table 3 for list of input variables availables.
    4. statistic: ``str``
        daily statistic chosen, 3 possibilities
        daily_mean or daily_minimum or daily_maximum.
    5. area: ``List[int]``
        bounding box of the demanded area
        area = [lat_max, lon_min, lat_min, lon_max]
    6. output_path: ``str``
        path for output file.

    Returns
    =======

    ``None``
        failures are reported on standard output, they are not raised
    """
    year, month, variable, statistic, area, output_path = args

    # set name of output file for each month (statistic, variable, year, month)
    output_filename = os.path.join(
        output_path,
        "ERA5-land_"+year+"_"+month+"_"+variable+"_"+statistic+".nc")

    # Nothing to do when the month has already been downloaded
    if os.path.isfile(output_filename):
        print(output_filename, ' already exist')
        return None

    # Data is streamed here first and moved to output_filename once complete
    partial_filename = output_filename + '.part'

    try:
        c = cdsapi.Client(timeout=300)

        result = c.service("tool.toolbox.orchestrator.workflow",
                           params={
                               "realm": "c3s",
                               "project": "app-c3s-daily-era5-statistics",
                               "version": "master",
                               "kwargs": {
                                   "dataset": "reanalysis-era5-land",
                                   "product_type": "reanalysis",
                                   "variable": variable,
                                   "statistic": statistic,
                                   "year": year,
                                   "month": month,
                                   "time_zone": "UTC+00:0",
                                   "frequency": "1-hourly",
                                   "grid": "0.1/0.1",
                                   # the service expects [south, north] and [west, east]
                                   "area": {"lat": [area[2], area[0]],
                                            "lon": [area[1], area[3]]}
                               },
                               "workflow_name": "application"
                           })

        location = result[0]['location']
        res = requests.get(location, stream=True, timeout=300)
        # An HTTP error page must not be saved as if it were a netcdf file
        res.raise_for_status()

        print("Writing data to " + output_filename)
        with open(partial_filename, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                fh.write(chunk)

        # Atomic rename: the final file only exists when it is complete
        os.replace(partial_filename, output_filename)

    except Exception as error:
        # Best effort: one failed month must not stop the other downloads
        print('!! request', variable, '  failed !! -> year', year, 'month', month)
        print('   reason:', repr(error))

    finally:
        # Remove leftovers of an interrupted download (also on KeyboardInterrupt)
        if os.path.isfile(partial_filename):
            os.remove(partial_filename)

    return None


def call_era5land_daily_for_MODSPA(start_date: str, end_date: str, area: List[float], output_path: str, processes: int = 9) -> None:
    """
    Download every monthly ERA5-land daily file needed for ET0 calculus and MODSPA forcing
    `reanalysis_era5  <https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land?tab=overview>`_

    Requested variables and daily statistics
    ----------------------------------------

        * **2m_temperature** (daily minimum and maximum)
        * **2m_dewpoint_temperature** (daily minimum and maximum)
        * **10m_u_component_of_wind** (daily mean)
        * **10m_v_component_of_wind** (daily mean)
        * **total_precipitation** (daily mean)
        * **surface_solar_radiation_downwards** (daily mean)

    One download task is created per (variable, statistic, month) and the
    tasks are dispatched to ``call_era5land_daily`` through ``p_map``.

    Arguments
    =========

    1. start_date: ``str``
        start date in YYYY-MM-DD format
    2. end_date: ``str``
        end date in YYYY-MM-DD format
    3. area: ``List[float]``
        bounding box of the demanded area
        area = [lat_max, lon_min, lat_min, lon_max]
    4. output_path: ``str``
        directory in which the monthly ``.nc`` files are written
    5. processes: ``int`` ``default = 9``
        number of workers given to ``p_map`` (``num_cpus``).
        can be higher than your actual number of processor cores,
        download operations have a low CPU demand.

    Returns
    =======

    ``None``
    """

    # Every month touched by the period, represented by its first day
    first_month = datetime.strptime(start_date, '%Y-%m-%d').replace(day=1)
    last_month = datetime.strptime(end_date, '%Y-%m-%d').replace(day=1)
    months = list(rrule(freq=MONTHLY, dtstart=first_month, until=last_month, bymonthday=1))

    # Daily statistics to request for each variable
    statistics_per_variable = {
        '2m_temperature': ['daily_minimum', 'daily_maximum'],
        '10m_u_component_of_wind': ['daily_mean'],
        '10m_v_component_of_wind': ['daily_mean'],
        'total_precipitation': ['daily_mean'],
        'surface_solar_radiation_downwards': ['daily_mean'],
        '2m_dewpoint_temperature': ['daily_minimum', 'daily_maximum']
    }

    # One task per variable, statistic and month (month zero-padded to MM)
    download_tasks = [
        (str(month_start.year), str(month_start.month).zfill(2), variable, statistic, area, output_path)
        for variable, statistics in statistics_per_variable.items()
        for statistic in statistics
        for month_start in months
    ]

    # Run the downloads in parallel with a progress bar
    p_map(call_era5land_daily, download_tasks, num_cpus=processes)

    return None


def filename_to_datetime(filename: str) -> datetime.date:
    """
    filename_to_datetime returns a ``datetime.date`` object for the month of the given file name.

    Arguments
    =========

    1. filename: ``str``
        name or path of the product, it must contain a ``yyyy_mm`` pattern.
        The first occurrence found in the string is used.

    Returns
    =======

    1. date: ``datetime.date``
        datetime.date object, first day of the month of the product

    Raises
    ======

    ``ValueError``
        if no ``yyyy_mm`` pattern is found or if it is not a valid year/month
    """

    # Search for a date pattern (yyyy_mm) in the product name or path
    # (raw string: '\d' in a normal string literal is an invalid escape sequence)
    match = re.search(r'\d{4}_\d{2}', filename)
    if match is None:
        raise ValueError("no 'yyyy_mm' date pattern found in file name: " + repr(filename))

    return datetime.strptime(match[0], '%Y_%m').date()


def concat_monthly_nc_file(list_era5land_monthly_files: List[str], list_variables: List[str], output_path: str) -> List[str]:
    """
    Concatenate monthly netcdf datasets into a single file for each given variable.

    Arguments
    =========

    1. list_era5land_monthly_files: ``List[str]``
        list of daily files per month (the list itself is not modified)
    2. list_variables: ``List[str]``
        names of the required variables as written in the filename
    3. output_path: ``str``
        directory in which the aggregated files are saved,
        created if it does not exist

    Returns
    =======

    1. list_era5land_files: ``List[str]``
        the list of paths to the aggregated files

    Raises
    ======

    ``ValueError``
        if no monthly file matches one of the requested variables
    """

    os.makedirs(output_path, exist_ok=True)

    # Sorted copy: file names sort chronologically (yyyy_mm) and the caller's list is left untouched
    monthly_files = sorted(list_era5land_monthly_files)

    list_era5land_files = []

    # concatenate all dates into a single file for each variable
    for variable in list_variables:

        # find files of the specific variable
        curr_var_list = [file for file in monthly_files if fnmatch(file, '*' + variable + '*')]
        if not curr_var_list:
            raise ValueError('no monthly ERA5-land file found for variable: ' + repr(variable))

        dates = [filename_to_datetime(file) for file in curr_var_list]

        # Create file name from first and last month
        concatenated_file = os.path.join(
            output_path,
            'era5-land_' + dates[0].strftime('%m-%Y') + '_' + dates[-1].strftime('%m-%Y') + '_' + variable + '.nc')

        curr_datasets = []
        try:
            # open all months for the given variable
            for file in curr_var_list:
                curr_datasets.append(xr.open_dataset(file))

            # Concatenate monthly datasets
            concatenated_dataset = xr.concat(curr_datasets, dim = 'time')

            # Save datasets
            concatenated_dataset.to_netcdf(path = concatenated_file, mode = 'w')
        finally:
            # Release file handles, also when opening or writing failed
            for dataset in curr_datasets:
                dataset.close()

        # Add filename to output list
        list_era5land_files.append(concatenated_file)

    return list_era5land_files


def uz_to_u2(u_z: List[float], h: float) -> List[float]:
    """
    Adjust a wind speed measured at height ``h`` to the equivalent
    wind speed at 2 m with a logarithmic profile:
    ``u2 = u_z * 4.87 / ln(67.8 * h - 5.42)``

    Arguments
    ----------
    u_z : TYPE float array
        measured wind speed h m above the ground surface, ms- 1.
        The multiplication by a float requires a scalar or numpy array,
        not a plain python list.
    h : TYPE float
        height of the measurement above the ground surface, m.

    Returns
    -------
    u2 : TYPE float array
        wind speed in meters per second (ms- 1 ) adjusted to 2 m above the ground.
    """

    scaled_speed = u_z*4.87
    height_term = np.log(67.8*h - 5.42)

    return scaled_speed/height_term


def ea_calc(T: float) -> float:
    """
    Saturation vapour pressure at temperature T (FAO-56 equation 11).
    Evaluated at the dewpoint temperature it gives the actual
    vapour pressure (ea), FAO-56 equation 14.

    Arguments
    ----------
    T : Temperature in degree celsius (scalar, numpy array or pandas Series).

    Returns
    -------
    e_a : the vapour pressure in kPa
    """

    # FAO-56: e(T) = 0.6108 * exp(17.27 * T / (T + 237.3)).
    # The denominator constant is 237.3 (not 237.15, easily confused
    # with the 273.15 K <-> degree C offset).
    e_a = 0.6108*np.exp(17.27*T/(T+237.3))
    return e_a


def load_variable(file_name: str) -> xr.Dataset:
    """
    Loads an ERA5 meteorological variable into a xarray
    dataset according to the modspa architecture.

    Temperature and dewpoint temperature variables are renamed after the
    daily statistic found in the file name (``t2m_max``, ``t2m_min``,
    ``d2m_max``, ``d2m_min``), other variables keep their name.
    The ``realization`` coordinate is dropped when present.

    Arguments
    =========

    1. file_name: ``str``
        netcdf file to load

    Returns
    =======

    1. variable: ``xr.Dataset``
        output xarray dataset
    """

    # File name pattern -> renaming to apply (patterns are mutually exclusive)
    renaming_rules = (
        ('*era5-land*2m_temperature_daily_maximum*', {'t2m': 't2m_max'}),  # maximum temperature
        ('*era5-land*2m_temperature_daily_minimum*', {'t2m': 't2m_min'}),  # minimum temperature
        ('*era5-land*2m_dewpoint_temperature_daily_maximum*', {'d2m': 'd2m_max'}),  # maximum dewpoint temperature
        ('*era5-land*2m_dewpoint_temperature_daily_minimum*', {'d2m': 'd2m_min'}),  # minimum dewpoint temperature
    )

    variable = xr.open_dataset(file_name)

    # Rename temperature variables according to the statistic (max or min),
    # other variables are loaded without modification
    for pattern, new_names in renaming_rules:
        if fnmatch(file_name, pattern):
            variable = variable.rename(new_names)
            break

    # netcdfs from ERA5 carry an unnecessary 'realization' coordinate, so it is dropped.
    # errors='ignore': files that do not carry this coordinate are still loaded.
    return variable.drop_vars('realization', errors='ignore')


def combine_weather2netcdf(rain_file: str, ET0_tile: str, ndvi_path: str, save_path: str, available_ram: int) -> None:
    """
    Convert the Rain and ET0 geotiffs into a single weather netcdf dataset.

    An empty dataset with the structure of the ndvi cube is first written
    to ``save_path``, then the bands of both geotiffs are copied into it,
    by time slices sized according to the available memory.

    Arguments
    =========

    1. rain_file: ``str``
        path to Rain tif
    2. ET0_tile: ``str``
        path to ET0 tif
    3. ndvi_path: ``str``
        path to ndvi cube
    4. save_path: ``str``
        save path of weather netcdf dataset
    5. available_ram: ``int``
        available ram in GiB for conversion

    Returns
    =======

    ``None``
    """

    # Context managers close the rasters even if the conversion fails
    with rio.open(rain_file) as rain_tif, rio.open(ET0_tile) as ET0_tif:

        # Open ndvi netcdf to get structure
        with xr.open_dataset(ndvi_path) as ndvi:

            # Create empty dataset with same structure
            weather = ndvi.drop_vars(['ndvi']).copy(deep = True)

            dim_names = tuple(ndvi.sizes)
            empty_shape = tuple(ndvi.sizes[d] for d in dim_names)

            # NOTE(review): the rain geotiff produced in this module is documented with a
            # scale factor of 100, the attribute below says 1000 - confirm which one is right
            weather['Rain'] = (dim_names, np.zeros(empty_shape, dtype = np.uint16))
            weather['Rain'].attrs['units'] = 'mm'
            weather['Rain'].attrs['standard_name'] = 'total_precipitation'
            weather['Rain'].attrs['description'] = 'Accumulated daily precipitation in mm'
            weather['Rain'].attrs['scale factor'] = '1000'

            weather['ET0'] = (dim_names, np.zeros(empty_shape, dtype = np.uint16))
            weather['ET0'].attrs['units'] = 'mm'
            weather['ET0'].attrs['standard_name'] = 'Transpiration'
            weather['ET0'].attrs['description'] = 'Accumulated daily reference evapotranspiration in mm'
            weather['ET0'].attrs['scale factor'] = '1000'

            # Create encoding dictionary, one entry per weather variable.
            # (it must be built once: re-creating it for each variable keeps only the last entry)
            encoding_dict = {name: {'dtype': 'u2'} for name in ('Rain', 'ET0')}

            # Save empty output
            print('\nWriting empty weather dataset...')
            weather.to_netcdf(save_path, encoding=encoding_dict)
            weather.close()

        # Get geotiff dimensions (time, y, x)
        dims = (rain_tif.count, rain_tif.height, rain_tif.width)

        # Determine the memory requirement of operation
        nb_bytes = 2  # 16 bit integers
        nb_vars = 1  # one variable written at a time
        memory_requirement = ((dims[0] * dims[1] * dims[2]) * nb_vars * nb_bytes) / (1024**3)  # in GiB
        security_factor = 0.8  # it is difficult to estimate true memory usage with compression algorithms, apply a security factor to prevent memory overload

        # Get the number of time bands that can be loaded at once
        time_slice, remainder, already_loaded = calculate_time_slices_to_load(memory_requirement, dims[0], security_factor, available_ram)

        print('\nApproximate memory requirement of conversion:', round(memory_requirement, 3), 'GiB\nAvailable memory:', available_ram, 'GiB\n\nLoading', time_slice, 'time slices at a time.\n' )

        # Open empty dataset in update mode and create progress bar
        with nc.Dataset(save_path, mode = 'r+') as weather_nc, \
                tqdm(total = dims[0], desc='Writing weather data', unit=' bands') as progress_bar:

            rain_var = weather_nc.variables['Rain']
            ET0_var = weather_nc.variables['ET0']

            # Data variables
            for i in range(dims[0]):

                if time_slice == dims[0] and not already_loaded:  # if whole dataset fits in memory and it has not already been loaded

                    rain_var[:,:,:] = rain_tif.read()
                    ET0_var[:,:,:] = ET0_tif.read()
                    already_loaded = True

                elif i % time_slice == 0:  # load a time slice every time i is divisible by the size of the time slice

                    # the last slice is shortened to the remainder when it would go over the dataset size
                    nb_bands = time_slice if i + time_slice <= dims[0] else remainder

                    # rasterio band indexes start at 1
                    bands = tuple(k+1 for k in range(i, i + nb_bands))

                    rain_var[i: i + nb_bands, :, :] = rain_tif.read(bands)
                    ET0_var[i: i + nb_bands, :, :] = ET0_tif.read(bands)

                progress_bar.update()

    return None


def calculate_ET0_pixel(pixel_dataset: xr.Dataset, lat: float, lon: float, h: float = 10) -> np.ndarray:
    """
    Calculate ET0 over the year for a single pixel of the ERA5 weather dataset.

    The variables of the pixel are gathered in a dataframe (temperatures
    converted from K to °C, relative humidity derived from the dewpoint
    temperature, scalar wind speed from its two components) which is
    given to the ``eto`` library.

    Arguments
    =========

    1. pixel_dataset: ``xr.Dataset``
        extracted dataset that contains all information for a single pixel
        (variables ``d2m_max``, ``d2m_min``, ``t2m_max``, ``t2m_min``,
        ``tp``, ``u10``, ``v10`` and ``ssrd``)
    2. lat: ``float``
        latitudinal coordinate of that pixel
    3. lon: ``float``
        longitudinal coordinate of that pixel
    4. h: ``float`` ``default = 10``
        height of ERA5 wind measurement in meters

    Returns
    =======

    1. ET0_values: ``np.ndarray``
        numpy array containing the ET0 values for each day
    """

    # Conversion of xarray dataset to dataframe for ET0 calculation
    ET0 = pixel_dataset.d2m_max.to_dataframe().rename(columns = {'d2m_max' : 'Dew_Point_T_max'}) - 273.15  # conversion of temperatures from K to °C

    ET0['Dew_Point_T_min'] = pixel_dataset.d2m_min.to_dataframe()['d2m_min'].values - 273.15  # conversion of temperatures from K to °C
    ET0['T_min'] = pixel_dataset.t2m_min.to_dataframe()['t2m_min'].values - 273.15  # conversion of temperatures from K to °C
    ET0['T_max'] = pixel_dataset.t2m_max.to_dataframe()['t2m_max'].values - 273.15  # conversion of temperatures from K to °C

    ET0['Rain'] = pixel_dataset.tp.to_dataframe()['tp'].values*1000  # conversion of total precipitation from meters to millimeters

    # Conversion of eastward and northward wind values to scalar wind
    u_wind = pixel_dataset.u10.to_dataframe()['u10'].values
    v_wind = pixel_dataset.v10.to_dataframe()['v10'].values
    ET0['U_z'] =  np.sqrt(u_wind**2 + v_wind**2)

    # Relative humidity from dew point temperature and temperature.
    # The maximum humidity is computed with the minimum temperatures and vice versa.
    ET0['RH_max'] =  100 * ea_calc(ET0['Dew_Point_T_min']) / ea_calc(ET0['T_min'])
    ET0['RH_min'] =  100 * ea_calc(ET0['Dew_Point_T_max']) / ea_calc(ET0['T_max'])

    ET0['R_s'] = pixel_dataset.ssrd.to_dataframe()['ssrd'].values/1e6  # to convert downward total radiation from J/m² to MJ/m²

    ET0.drop(columns = ['Dew_Point_T_max', 'Dew_Point_T_min'], inplace = True)  # drop unnecessary columns

    # Start ET0 calculation
    eto_calc = eto.ETo()

    # Silence the pandas warnings for the calculation only: the previous
    # warning filters are restored when the block is left, instead of
    # disabling every warning of the process for good
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # ET0 calculation for given pixel (lat, lon) values
        eto_calc.param_est(ET0,
                            freq = 'D',  # daily frequency
                            # Elevation of the met station above mean sea level (m) (only needed if P is not in df).
                            z_msl = 0.,
                            lat = lat,
                            lon = lon,
                            TZ_lon = None,
                            z_u = h)  # h: height of raw wind speed measurement

        # Retrieve ET0 values
        ET0_values = eto_calc.eto_fao(max_ETo=15, min_ETo=0, interp=True, maxgap=10).values  # ETo_FAO_mm

    return ET0_values


def convert_interleave_mode(args: Tuple[str, str, bool]) -> None:
    """
    Rewrite a Geotiff file (obtained from OTB) with band interleaving for faster band reading.

    Arguments
    =========

    (packed in args: ``tuple``)

    1. input_image: ``str``
        input image to convert
    2. output_image: ``str``
        output image to save
    3. remove: ``bool``
        whether to delete the input image once the output is written

    Returns
    =======

    ``None``
    """

    input_image, output_image, remove = args

    with rio.open(input_image, "r") as src:

        # The output copies the geometry of the input, only the interleaving differs
        output_profile = dict(
            driver = src.driver,
            height = src.height,
            width = src.width,
            count = src.count,
            dtype = src.dtypes[0],
            crs = src.crs,
            transform = src.transform,
            interleave = 'BAND',
        )

        with rio.open(output_image, 'w', **output_profile) as dst:

            # Copy the image window by window, following the blocks of the first band
            for _, window in src.block_windows(1):
                block_data = src.read(window = window)
                dst.write(block_data, window = window)

    # Remove unnecessary image once both files are closed
    if remove:
        os.remove(input_image)

    return None


def era5Land_daily_to_yearly_pixel(list_era5land_files: List[str], output_file: str, raw_S2_image_ref: str, ndvi_path: str, h: float = 10, max_ram: int = 8, remove: bool = True) -> str:
    """
    Calculate ET0 values from the ERA5 netcdf weather variables.
    Output netcdf contains the ET0 and precipitation values for
    each day in the selected time period and reprojected
    (reprojection run on two processors) on the same grid as the
    NDVI values.

    Arguments
    =========

    1. list_era5land_files: ``List[str]``
        list of netcdf files containing the necessary variables
    2. output_file: ``str``
        output file name without extension
    3. raw_S2_image_ref: ``str``
        raw Sentinel 2 image at right resolution for reprojection
    4. ndvi_path: ``str``
        path to ndvi dataset, used for attributes and coordinates
    5. h: ``float`` ``default = 10``
        height of ERA5 wind measurements in meters
    6. max_ram: ``int`` ``default = 8``
        max ram (in GiB) for reprojection and conversion. Two
        subprocesses are spawned for OTB, each receiving
        half of requested memory.
    7. remove: ``bool`` ``default = True``
        whether to remove temporary files (geotiffs before and
        after reprojection)

    Returns
    =======

    1. output_file_final: ``str``
        path to ``netCDF4`` file containing precipitation and ET0 data,
        ``None`` when the requested memory is not available

    Raises
    ======

    ``ValueError``
        if ``list_era5land_files`` is empty
    """

    # Test if memory requirement is not too large
    available_memory = virtual_memory().available / (1024**3)  # in GiB
    if np.ceil(available_memory) < max_ram:
        print('\nRequested', max_ram, 'GiB of memory when available memory is approximately', round(available_memory, 1), 'GiB.\n\nExiting script.\n')
        return None

    if not list_era5land_files:
        raise ValueError('list_era5land_files is empty: no ERA5-land file to load.')

    # Load all files into a single xarray dataset that contains all dates (daily frequency)
    raw_weather_ds = None
    for file in list_era5land_files:
        loaded = load_variable(file)
        raw_weather_ds = loaded if raw_weather_ds is None else xr.merge([loaded, raw_weather_ds])

    # Create ET0 variable (that will be saved), filled with zeros
    dim_names = tuple(raw_weather_ds.sizes)
    raw_weather_ds = raw_weather_ds.assign(ET0 = (dim_names, np.zeros(tuple(raw_weather_ds.sizes[d] for d in dim_names), dtype = 'float64')))

    # Loop on latitude and longitude coordinates to calculate ET0 per "pixel"
    for lat in raw_weather_ds.coords['lat'].values:
        for lon in raw_weather_ds.coords['lon'].values:
            # Select whole time period for given (lat, lon) values
            select_ds = raw_weather_ds.sel({'lat' : lat, 'lon' : lon}).drop_vars(['lat', 'lon'])

            # Calculate ET0 values for given pixel
            ET0_values = calculate_ET0_pixel(select_ds, lat, lon, h)

            # Write ET0 values in xarray Dataset
            raw_weather_ds['ET0'].loc[{'lat' : lat, 'lon' : lon}] = ET0_values

    # Get necessary data for final dataset and rewrite netcdf attributes
    final_weather_ds = raw_weather_ds.drop_vars(names = ['ssrd', 'v10', 'u10', 't2m_max', 't2m_min', 'd2m_max', 'd2m_min'])  # remove unwanted variables
    final_weather_ds['tp'] = final_weather_ds['tp'] * 1000  # conversion from m to mm

    # Change datatype to reduce memory usage (values are stored as scaled integers)
    final_weather_ds['tp'] = (final_weather_ds['tp']  * 100).astype('i2').chunk(chunks={"time": 1})
    final_weather_ds['ET0'] = (final_weather_ds['ET0']  * 1000).astype('i2').chunk(chunks={"time": 1})

    # Write projection
    # NOTE(review): the `.rio` accessor is presumably registered by rioxarray imported elsewhere - confirm
    final_weather_ds = final_weather_ds.rio.write_crs('EPSG:4326')

    # Set variable attributes
    final_weather_ds['ET0'].attrs['units'] = 'mm'
    final_weather_ds['ET0'].attrs['standard_name'] = 'Potential evapotranspiration'
    final_weather_ds['ET0'].attrs['comment'] = 'Potential evapotranspiration accumulated over the day, calculated with the FAO-56 method (scale factor = 1000)'

    final_weather_ds['tp'].attrs['units'] = 'mm'
    final_weather_ds['tp'].attrs['standard_name'] = 'Precipitation'
    final_weather_ds['tp'].attrs['comment'] = 'Volume of total daily precipitation expressed as water height in milimeters (scale factor = 100)'

    # Save dataset to geotiff, still in wgs84 (lat, lon) coordinates
    output_file_rain = output_file + '_rain.tif'
    output_file_ET0 = output_file + '_ET0.tif'
    final_weather_ds.tp.rio.to_raster(output_file_rain, dtype = 'uint16')
    final_weather_ds.ET0.rio.to_raster(output_file_ET0, dtype = 'uint16')

    # Reprojected image paths
    output_file_rain_reproj = output_file + '_rain_reproj.tif'
    output_file_ET0_reproj = output_file + '_ET0_reproj.tif'

    # Converted image paths
    output_file_final = output_file + '.nc'

    # otbcli_SuperImpose commands, each one receives half of the requested memory (in MiB)
    # NOTE(review): commands are built by string concatenation, paths that contain spaces will break them
    otb_ram = str(int(max_ram * 1024/2))
    OTB_command_reproj1 = 'otbcli_Superimpose -inr ' + raw_S2_image_ref + ' -inm ' + output_file_rain + ' -out ' + output_file_rain_reproj + ' uint16 -interpolator nn -ram ' + otb_ram
    OTB_command_reproj2 = 'otbcli_Superimpose -inr ' + raw_S2_image_ref + ' -inm ' + output_file_ET0 + ' -out ' + output_file_ET0_reproj + ' uint16 -interpolator nn -ram ' + otb_ram
    commands_reproj = [OTB_command_reproj1, OTB_command_reproj2]

    # Run both reprojections in parallel
    with Pool(2) as p:
        p.map(os.system, commands_reproj)

    # Combine to netCDF file
    combine_weather2netcdf(output_file_rain_reproj, output_file_ET0_reproj, ndvi_path, output_file_final, available_ram = max_ram)

    # remove temporary files, only when requested
    if remove:
        for temporary_file in (output_file_rain, output_file_ET0, output_file_rain_reproj, output_file_ET0_reproj):
            os.remove(temporary_file)

    return output_file_final


def era5Land_daily_to_yearly_parcel(list_era5land_files: List[str], output_file: str, h: float = 108) -> Tuple[str, str]:
    """
    Calculate ET0 values from the ERA5 netcdf weather variables.
    Two multiband geotiffs (precipitation and ET0) are written in
    WGS84 coordinates, with one band for each day in the selected
    time period.

    Arguments
    =========

    1. list_era5land_files: ``List[str]``
        list of netcdf files containing the necessary variables
    2. output_file: ``str``
        output file name without extension
    3. h: ``float`` ``default = 108``
        height of ERA5 wind measurements in meters.
        NOTE(review): the documentation used to announce a default of 10,
        which is also the default of ``era5Land_daily_to_yearly_pixel``.
        108 looks like a typo, confirm before changing it.

    Returns
    =======

    1. output_file_rain: ``str``
        path to ``Geotiff`` file containing precipitation data
    2. output_file_ET0: ``str``
        path to ``Geotiff`` file containing ET0 data

    Raises
    ======

    ``ValueError``
        if ``list_era5land_files`` is empty
    """

    if not list_era5land_files:
        raise ValueError('list_era5land_files is empty: no ERA5-land file to load.')

    # Load all files into a single xarray dataset that contains all dates (daily frequency)
    raw_weather_ds = None
    for file in list_era5land_files:
        loaded = load_variable(file)
        raw_weather_ds = loaded if raw_weather_ds is None else xr.merge([loaded, raw_weather_ds])

    # Create ET0 variable (that will be saved), filled with zeros
    dim_names = tuple(raw_weather_ds.sizes)
    raw_weather_ds = raw_weather_ds.assign(ET0 = (dim_names, np.zeros(tuple(raw_weather_ds.sizes[d] for d in dim_names), dtype = 'float64')))

    # Loop on latitude and longitude coordinates to calculate ET0 per "pixel"
    for lat in raw_weather_ds.coords['lat'].values:
        for lon in raw_weather_ds.coords['lon'].values:
            # Select whole time period for given (lat, lon) values
            select_ds = raw_weather_ds.sel({'lat' : lat, 'lon' : lon}).drop_vars(['lat', 'lon'])

            # Calculate ET0 values for given pixel
            ET0_values = calculate_ET0_pixel(select_ds, lat, lon, h)

            # Write ET0 values in xarray Dataset
            raw_weather_ds['ET0'].loc[{'lat' : lat, 'lon' : lon}] = ET0_values

    # Get necessary data for final dataset and rewrite netcdf attributes
    final_weather_ds = raw_weather_ds.drop_vars(names = ['ssrd', 'v10', 'u10', 't2m_max', 't2m_min', 'd2m_max', 'd2m_min'])  # remove unwanted variables
    final_weather_ds['tp'] = final_weather_ds['tp'] * 1000  # conversion from m to mm

    # Change datatype to reduce memory usage (values are stored as scaled integers)
    final_weather_ds['tp'] = (final_weather_ds['tp']  * 100).astype('i2').chunk(chunks={"time": 1})
    final_weather_ds['ET0'] = (final_weather_ds['ET0']  * 1000).astype('i2').chunk(chunks={"time": 1})

    # Write projection
    # NOTE(review): the `.rio` accessor is presumably registered by rioxarray imported elsewhere - confirm
    final_weather_ds = final_weather_ds.rio.write_crs('EPSG:4326')

    # Set variable attributes
    final_weather_ds['ET0'].attrs['units'] = 'mm'
    final_weather_ds['ET0'].attrs['standard_name'] = 'Potential evapotranspiration'
    final_weather_ds['ET0'].attrs['comment'] = 'Potential evapotranspiration accumulated over the day, calculated with the FAO-56 method (scale factor = 1000)'

    final_weather_ds['tp'].attrs['units'] = 'mm'
    final_weather_ds['tp'].attrs['standard_name'] = 'Precipitation'
    final_weather_ds['tp'].attrs['comment'] = 'Volume of total daily precipitation expressed as water height in milimeters (scale factor = 100)'

    # Save dataset to geotiff, still in wgs84 (lat, lon) coordinates
    output_file_rain = output_file + '_rain.tif'
    output_file_ET0 = output_file + '_ET0.tif'
    final_weather_ds.tp.rio.to_raster(output_file_rain, dtype = 'uint16')
    final_weather_ds.ET0.rio.to_raster(output_file_ET0, dtype = 'uint16')

    return output_file_rain, output_file_ET0


def extract_weather_values(args: tuple) -> List[float]:
    """
    Extract the per-polygon mean of a multiband weather Geotiff for every
    polygon of a shapefile. Each band of the raster is paired with one date of
    the daily range ``start_date`` - ``end_date`` read from the config file
    (band ``i`` is paired with the ``i``-th date).

    For every polygon and every band, one record
    ``[date, id, mean value, LC]`` is appended to the output list, where ``id``
    is the shapefile row index + 1 and ``LC`` is the land cover attribute of
    the polygon. The list is meant to be aggregated later in a ``DataFrame``.

    The arguments are packed in a single tuple so that the function can be
    used with ``multiprocessing.Pool.map``.
    
    Arguments (packed in args: ``tuple``)
    =====================================

    weather_path: ``str``
        path to multiband Geotiff 
    shapefile: ``str``
        path to shapefile (must have a ``LC`` attribute)
    config_file: ``str``
        path to config file

    Returns
    =======

    weather_stats: ``List[float]``
        list of ``[date, id, value, LC]`` records for every polygon in the
        shapefile and every band of the raster. ``None`` is returned (after
        printing a message) if a polygon bounding box does not overlap the
        weather raster.
    """
    
    # Open arguments packed in args
    weather_path, shapefile, config_file = args
    
    # Open config file
    config_params = config(config_file)
    
    # Create list where zonal statistics will be stored
    weather_stats = []
    
    # Get dates
    dates = pd.to_datetime(pd.date_range(start = config_params.start_date, end = config_params.end_date, freq = 'D')).values

    # Open weather raster, the context manager closes it on every exit path
    with rio.open(weather_path) as weather_dataset:

        # Get input raster spatial reference to reproject shapefile in the same spatial reference
        target_epsg = weather_dataset.crs

        # Open shapefile with geopandas and reproject its geometry
        parcels = gpd.read_file(shapefile)
        parcels['geometry'] = parcels['geometry'].to_crs(target_epsg)

        # Get no data value
        nodata = weather_dataset.nodata
        
        # Get the number of bands
        nbands = weather_dataset.count
        
        # Create progress bar, closed automatically even on early return
        with tqdm(total = len(parcels.index), desc='Extracting polygon values', unit=' polygons') as progress_bar:

            # Loop on the individual polygons in the shapefile geometry
            for index, row in parcels.iterrows():
                
                # Get the feature geometry as a shapely object
                geom = row.geometry
                
                # id number of the current parcel geometry
                feature_id = index + 1
                
                # Get land cover
                LC = row.LC
                
                # Create a bounding box around the geometry
                bbox = box(*geom.bounds)
                
                # Crop the raster using the bounding box, only to check that
                # the polygon lies inside the raster (result is not needed).
                # rasterio raises a ValueError when shapes do not overlap the raster.
                try:
                    mask(weather_dataset, [bbox], crop = True, all_touched = True)
                except ValueError:
                    print('\nShapefile bounds are not contained in weather dataset bounds.\n\nExiting script.')
                    return None
                
                # Mask the raster using the geometry
                masked_raster, _ = mask(weather_dataset, [geom], crop = True, all_touched = True)
                
                # Replace the nodata values with nan (float conversion needed to hold nan)
                masked_raster = masked_raster.astype(np.float32)
                if nodata is not None:
                    masked_raster[masked_raster == nodata] = np.nan
                
                # Calculate the zonal statistics, one record per band (= per date)
                weather_stats.extend([[dates[i], feature_id, np.nanmean(masked_raster[i]), LC] for i in range(nbands)])
                
                # Update progress bar
                progress_bar.update(1)

    return weather_stats


def extract_weather_dataframe(rain_path: str, ET0_path: str, shapefile: str, config_file: str, save_path: str) -> None:
    """
    Build the weather dataframe (Rain and ET0 per date and per polygon) from the
    two weather Geotiff files and write it to a ``csv`` file. The extraction
    of each variable is delegated to ``extract_weather_values`` and run in
    its own process (two workers).

    Arguments
    =========

    1. rain_path: ``str``
        path to rain Geotiff file
    2. ET0_path: ``str``
        path to ET0 Geotiff file
    3. shapefile: ``str``
        path to shapefile
    4. config_file: ``str``
        path to config file
    5. save_path: ``str``
        save path for weather dataframe

    Returns
    =======

    ``None``
    """
    
    # One argument tuple per weather variable: rain first, ET0 second
    worker_args = [(raster_path, shapefile, config_file) for raster_path in (rain_path, ET0_path)]
    
    print('\nStarting weather data extraction on two cores..\n')
    
    # Run both extractions in parallel, results come back in the order of worker_args
    with Pool(2) as pool:
        extracted = pool.map(extract_weather_values, worker_args)
    
    # Rain records form the base table, the ET0 column is taken from the second result
    # (rows are matched on the default integer index)
    weather_dataframe = pd.DataFrame(extracted[0], columns = ['date', 'id', 'Rain', 'LC'])
    ET0_frame = pd.DataFrame(extracted[1], columns = ['date', 'id', 'ET0', 'LC'])
    weather_dataframe['ET0'] = ET0_frame['ET0']
    
    # Put columns in their final order
    weather_dataframe = weather_dataframe.reindex(columns = ['date', 'id', 'Rain', 'ET0', 'LC'])
    
    # Round both weather variables and store them as integers
    for variable in ('Rain', 'ET0'):
        weather_dataframe[variable] = np.round(weather_dataframe[variable]).astype(int)
    
    # Make sure dates are stored as datetime
    weather_dataframe['date'] = pd.to_datetime(weather_dataframe['date'])
    
    # Write the table to disk without the index column
    weather_dataframe.to_csv(save_path, index = False)
    
    return None