# parcel_to_pixel.py

# -*- coding: UTF-8 -*-
# Python
"""
29-08-2023
@author: jeremy auclair

Convert pandas dataframes to rasters for the pixel mode.
"""

from typing import List, Union  # to declare variables
import numpy as np  # for math on arrays
import xarray as xr  # to manage nc files
import pandas as pd  # to manage dataframes
import geopandas as gpd  # to manage shapefiles


def convert_dataframe_to_xarray(dataframe_in: Union[str, pd.DataFrame], save_path: str, variables: List[str], data_types: List[str], time_dimension: bool = True) -> None:
    """
    Convert ``pandas dataframes`` of the parcel mode into ``xarray datasets``
    for the model calculations and save them as a netCDF file. The resulting
    xarray dataset has dimensions:
    ``time: number of dates``, ``y: 1``, ``x: number of polygons`` *(to make a 3D dataset)*,

    or dimensions: ``y: 1``, ``x: number of polygons`` *(to make a 2D dataset)*

    The rows are ordered by ``date`` (when present) then ``id`` before being
    reshaped, so every variable must hold exactly
    ``number of dates * number of ids`` values (``number of ids`` values for
    the 2D case). The input dataframe is never modified.

    Arguments
    =========

    1. dataframe_in: ``Union[str, pd.DataFrame]``
        dataframe or path to a csv file to convert. It must have an ``id``
        column, and a ``date`` column if ``time_dimension`` is ``True``
    2. save_path: ``str``
        save path of output xarray dataset
    3. variables: ``List[str]``
        name of variables (or variable, list can have one element)
        to put in the output dataset
    4. data_types: ``List[str]``
        xarray datatypes corresponding to the variable names, for
        correct saving of the dataset. Variables and datatypes are paired
        in order; a variable without a matching datatype gets no encoding
    5. time_dimension: ``bool`` ``default = True``
        boolean to indicate if the dataframe has a time dimension

    Returns
    =======

    ``None``

    Raises
    ======

    ``ValueError``
        if the number of values of a variable does not match the
        size of the output dimensions
    """

    # Open the dataframe when a path is given
    if isinstance(dataframe_in, str):
        dataframe = pd.read_csv(dataframe_in)
    else:
        dataframe = dataframe_in

    if time_dimension:
        # Parse the dates BEFORE sorting so that the row order is chronological
        # and matches the sorted ``time`` coordinate built below, whatever the
        # text format of the dates. ``assign`` returns a new dataframe, which
        # leaves the caller's dataframe untouched.
        dataframe = dataframe.assign(date = pd.to_datetime(dataframe['date']))
        sort_keys = ['date', 'id']
    else:
        sort_keys = ['id']

    # ``sort_values`` returns a sorted copy, the input is not modified
    dataframe = dataframe.sort_values(by = sort_keys)

    # Create spatial dimensions for xarray dataset
    x = list(range(1, len(set(dataframe['id'])) + 1))
    y = [1]

    dims = ('y', 'x')
    dim_size = (len(y), len(x))
    coords = {'y': y, 'x': x}

    # Prepend the time dimension for the 3D case
    if time_dimension:
        dates = np.unique(dataframe['date'].values)
        dims = ('time',) + dims
        dim_size = (len(dates),) + dim_size
        coords = {'time': dates, **coords}

    expected_size = int(np.prod(dim_size))

    # Reshape variables in correct format and put them in a dictionary
    data_variables = {}
    for var in variables:
        values = dataframe[var].values
        if values.size != expected_size:
            raise ValueError(
                f"cannot reshape variable '{var}' of size {values.size} into "
                f"dimensions {dict(zip(dims, dim_size))} ({expected_size} values expected)"
            )
        data_variables[var] = (dims, np.reshape(values, dim_size))

    # Create xarray dataset
    xarray_dataset = xr.Dataset(data_vars = data_variables, coords = coords)

    # Create encoding dictionary: one single chunk covering the whole variable
    encoding_dict = {
        var: {'dtype': dtype, 'chunksizes': dim_size}
        for var, dtype in zip(variables, data_types)
    }

    # Save dataset as netCDF4 file
    xarray_dataset.to_netcdf(save_path, encoding = encoding_dict)

    return None


def convert_geodataframe_to_xarray(geodataframe_in: Union[str, gpd.GeoDataFrame, pd.DataFrame], save_path: str, name: str, variable: str, data_type: str, global_attributes: Union[List[dict], None] = None) -> None:
    """
    Convert ``geopandas GeoDataFrames`` of the parcel mode into ``xarray DataArray``
    for the model calculations and save them as a netCDF file. The resulting
    xarray dataarray has dimensions:
    ``y: 1``, ``x: number of rows`` *(to make a 2D dataset)*.

    Arguments
    =========

    1. geodataframe_in: ``Union[str, gpd.GeoDataFrame, pd.DataFrame]``
        geodataframe, or path to a csv file (read with ``pandas``), to convert
    2. save_path: ``str``
        save path of output xarray dataarray
    3. name: ``str``
        name of dataarray to save
    4. variable: ``str``
        name of the column to extract and
        to put in the output dataarray
    5. data_type: ``str``
        xarray datatype corresponding to the variable name, for
        correct saving of the dataarray
    6. global_attributes: ``Union[List[dict], None]`` ``default = None``
        list of optional attributes to add to the dataarray before saving,
        given as a list of dictionaries ``{key: value}``. Every key of every
        dictionary is added. ``None`` or an empty list adds nothing

    Returns
    =======

    ``None``
    """

    # Open the dataframe when a path is given
    if isinstance(geodataframe_in, str):
        geodataframe = pd.read_csv(geodataframe_in)
    else:
        geodataframe = geodataframe_in

    # Create dimensions for xarray dataarray: one pixel per row
    x = list(range(1, len(geodataframe.index) + 1))
    y = [1]

    # Create dimension tuples
    dims = ('y', 'x')
    dim_size = (len(y), len(x))

    # Reshape variable in correct format
    data = np.reshape(geodataframe[variable].values, dim_size)

    # Create xarray dataarray. Dimension names are given explicitly instead of
    # being inferred from the ``coords`` dictionary.
    xarray_dataarray = xr.DataArray(data = data, coords = {'y': y, 'x': x}, dims = dims, name = name)

    # Create encoding dictionary: one single chunk covering the whole variable
    encoding_dict = {name: {'dtype': data_type, 'chunksizes': dim_size}}

    # Add potential attributes
    for attribute in global_attributes or []:
        xarray_dataarray.attrs.update(attribute)

    # Save dataarray as netCDF4 file
    xarray_dataarray.to_netcdf(save_path, encoding = encoding_dict)

    return None