biomod2_breeding_sites_public_spaces.R

# Script to use Biomod2 with raster layers describing urban landscapes and presence data of larval presence in urban spaces
# Author : Claire Teillet and Héloïse Pottier
# based on https://biomodhub.github.io/biomod2/articles/examples_1_mainFunctions.html 

#### Load package ####
library(sf)
library(biomod2)
library(dplyr)
library(terra)
library(ggplot2)
library(raster)
library(corrplot)
library(FactoMineR)
library(factoextra)
library(ggplot2)
library(GGally) 

#### Prepare data and parameters #### 

#### Load Presence Data ####
df_GU_potentiel <- read.csv("G:/0_PHD/RECHERCHE/AXE_2_BIS/data_montpellier/GU_potentiel_2154_clean_filtering_by_type.csv") 

#### Load Environmental data (tif) ####
var_env <- rast("G:/0_PHD/RECHERCHE/AXE_2_BIS/data_montpellier/grid_MTP_complete_IS_IT.tif") 

# Select only the layers if needed
# select_columns  <- c("NDVI_mean", "...")
# 
# var_env <- var_env[[select_columns]]

# Show selected layers
names(var_env)

#### Other method to select layers ####

# var_env_filtered <- subset(var_env, !grepl("FOTO", names(var_env)))
# names(var_env_filtered)

# test <- st_drop_geometry(var_env)
# png("G:/0_PHD/RECHERCHE/AXE_2_BIS/FIG/scatter_plots.png", width = 2000, height = 2000)
# pairs(test,
#       main = "Matrice de scatter plots de toutes les variables",
#       pch = 15,    
#       col = "blue",
#       cex = 2,        
#       cex.labels = 2, 
#       cex.main = 1)   
# dev.off()
# 
# ggpairs <- ggpairs(as.data.frame(test))

# Assigned 1 or 0 for if needed 
df_GU_potentiel <- df_GU_potentiel %>%
  mutate(positif = case_when(
    positif=="OUI"~"1",
    positif=="NON"~"0",
    TRUE ~ positif
  ))

df_GU_potentiel$positif <- as.integer(df_GU_potentiel$positif)

# Fix the seed for reproductibility
set.seed(123) 

# Formating data
df_GU_potentiel_biomod2 <- 
  BIOMOD_FormatingData(
    resp.var = df_GU_potentiel['positif'], # var to explain (presence data)
    resp.name = "positif", # name of the var
    resp.xy = df_GU_potentiel[, c('long', 'lat')], #XY coordinates
    expl.var = var_env, # environmental data
    
  )

plot(df_GU_potentiel_biomod2) 

#### Modeling ####

##### Individual models #####

GU_models2 <- BIOMOD_Modeling(bm.format = df_GU_potentiel_biomod2, # data
                             models = c("ANN", "CTA", "FDA", "GAM", "GBM", "GLM", "MARS", "MAXNET", "RF",
                                        "SRE", "XGBOOST"),  #vector containing several algorithms : "ANN", "MAXENT", "CTA", "FDA", "GAM", "GBM", "GLM", "MARS", "MAXNET", "RF", "SRE", "XGBOOST"
                             CV.strategy = 'kfold', #cross validation strategy : random, kfold, block, strat, env or user.defined
                             CV.nb.rep = 10, # number of repetitions for the cross validation 
                             CV.k = 10,# number of partitions of the dataset for the cross validation
                             OPT.strategy = "bigboss", # parameters of each algorithms "bigboss" or "default"
                             var.import = 3,# number of permutations for variables importance
                             nb.cpu = 16, # if cpu using all cores
                             metric.eval = c('TSS','ROC')) # evaluations metrics

# to save the model 
save(list = ls(all.names = TRUE), file = "XXX.Rdata", envir = .GlobalEnv)
# to load the model 
load("C:/Users/teill/Documents/0_PHD/MODELE/Git/biomod2/positif_0_70_0_75/positif.1723633541.models.out")

# get evaluation score 
eval_models <- get_evaluations(positif.1723633541.models.out) #or positif.1723633541.models.out if loaded

# Represent evaluation scores (TSS,AUC)
bm_PlotEvalMean(
  bm.out = GU_models2,
  metric.eval = NULL,
  dataset = "validation", # show for validation dataset
  group.by = "algo",# show by algorithms
  do.plot = TRUE   
)

#Represent variables importances by algo
bm_PlotVarImpBoxplot(bm.out = GU_models2, group.by = c('expl.var', 'algo', 'algo') ) # par models

var_importance <- get_variables_importance(GU_models2)
 
# get_models <- get_built_models(GU_models,  algo =  c("ANN", "CTA", "FDA",))

### Response Curves for individual models, selection options algo and run
bm_PlotResponseCurves(bm.out = GU_models2, 
                      models.chosen = get_built_models(GU_models2)[c(1:2, 13:14)],
                      fixed.var = 'median')
bm_PlotResponseCurves(bm.out =  GU_models2, 
                      models.chosen = get_built_models( GU_models2)[c(1:3, 12:14)],
                      fixed.var = 'min')


##### Ensemble modeling #####

# Get individual model
AllModels <- get_built_models(positif.1723633541.models.out)

# Select individual model 
AllModels <- AllModels[ grepl(pattern = "_ANN", x = AllModels, fixed = TRUE)| 
                          grepl(pattern = "_FDA", x = AllModels, fixed = TRUE) |
                          grepl(pattern = "_GLM", x = AllModels, fixed = TRUE)| 
                          grepl(pattern = "_MARS", x = AllModels, fixed = TRUE) |
                          grepl(pattern = "_MAXNET", x = AllModels, fixed = TRUE)| 
                          grepl(pattern = "_XGBOOST", x = AllModels, fixed = TRUE) ]

myBiomodEM_grep <- BIOMOD_EnsembleModeling(bm.mod = GU_models2, 
                                           em.by = 'all',
                                           em.algo = c("EMmean", "EMcv", "EMwmean", "EMmedian"), #, 'EMcv', 'EMci', 'EMmedian', 'EMca', 'EMwmean'
                                           models.chosen = AllModels,
                                           metric.select.dataset = "validation",
                                           metric.eval = c("TSS", "ROC"),
                                           var.import = 3,
                                           EMci.alpha = 0.05, # value corresponding to the significance level to estimate confidence interval
                                           EMwmean.decay = 'proportional', # A value defining the relative importance of the weights (if 'EMwmean' was given to argument em.algo). A high value will strongly discriminate good models from the bad ones (see Details), while proportional will attribute weights proportionally to the models evaluation scores
                                           nb.cpu = 16)

# load ensemble.model.out if needeed
load("positif_0_70_0_75/positif.1723633541.ensemble.models.out")

# If needeed of you can use directly a threshold to select individual model
# myBiomodEM <- BIOMOD_EnsembleModeling(bm.mod = GU_models2, # individual models 
#                                       em.by = 'all',
#                                       em.algo = c('EMmean', "EMcv", "EMwmean"), #, 'EMcv', 'EMci', 'EMmedian', 'EMca', 'EMwmean'
#                                       metric.select =  'ROC' ,
#                                       metric.select.thresh =  0.70, # threshold so select model
#                                       metric.select.dataset = "validation",
#                                       metric.eval = c('TSS', 'ROC'),
#                                       var.import = 3,
#                                       EMci.alpha = 0.05, # value corresponding to the significance level to estimate confidence interval
#                                       EMwmean.decay = 'proportional', # A value defining the relative importance of the weights (if 'EMwmean' was given to argument em.algo). A high value will strongly discriminate good models from the bad ones (see Details), while proportional will attribute weights proportionally to the models evaluation scores
#                                       nb.cpu = 16)

# Load directly the results if needed
load("C:/Users/teill/Documents/0_PHD/MODELE/Git/biomod2/positif/positif.1723633541.ensemble.models.out")

# Model use for article 1723633541.models.ensemble.out"

eval_ensemble_models <- get_evaluations(myBiomodEM_grep)
eval_ensemble_models
 

bm_PlotVarImpBoxplot(bm.out = myBiomodEM_grep, group.by = c('expl.var', 'algo', 'algo') ) # par models


#Represent response curves for ensemble modeling
load("C:/Users/teill/Documents/0_PHD/MODELE/Git/biomod2/positif_0_70_0_75/positif.1723633541.models.out")

get_variables_importance(myBiomodEM_grep)
varimp.out <- bm_PlotVarImpBoxplot(bm.out = myBiomodEM_grep, group.by = c('expl.var', 'merged.by.run','algo'), do.plot = FALSE) # par models

varimp.out$tab %>% 
  filter(algo == "EMmean") %>% 
  ggplot()+
  geom_boxplot(aes(x = expl.var, y = var.imp), fill = "#F08080")

varimp.out$tab %>% 
  filter(algo == "EMmean") %>% 
  arrange(desc(var.imp)) %>%  # order by importance scores
  mutate(expl.var = factor(expl.var, levels = unique(expl.var))) %>%   
  ggplot() +
  geom_boxplot(aes(x = expl.var, y = var.imp), fill = "#F08080") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


# Projection Forecasting
myBiomodEM_proj <- BIOMOD_EnsembleForecasting(bm.em = myBiomodEM_grep,
                                             proj.name = 'proj_positif_grep',  
                                             new.env = var_env,
                                             models.chosen = 'all',
                                             metric.binary = 'all',
                                             metric.filter = 'all',
                                             on_0_1000 = FALSE) # to get 0-1 mean probabilities
 
 
myBiomodEM_proj <- rast("G:/0_PHD/RECHERCHE/AXE_2_BIS/Biomod2/positif_0_70_0_75/proj_positif_grep/proj_positif_grep_positif_ensemble.tif")