From c1fa2153d747110118b67cc2825de6d8bb7648fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Wed, 19 Mar 2025 09:37:54 +0100
Subject: [PATCH] Implement glock block CV strategy

---
 R/03_02_absence_preparation.R                 |  55 +-
 R/03_03_model_data_finalization.R             |  79 ++
 R/04_01_modelling_ssdm.R                      | 106 ++-
 R/04_02_modelling_msdm_embed.R                |  64 +-
 R/04_03_modelling_msdm_onehot.R               |  64 +-
 ...ng_msdm_rf.R => 04_04_modelling_msdm_rf.R} |  94 +--
 R/04_06_rf_testing.R                          | 144 ----
 R/05_01_performance_report.qmd                |  16 +-
 R/05_01_performance_report_rf.qmd             | 692 ++++++++++++++++++
 R/05_02_publication_analysis.R                |  61 +-
 R/_publish.yml                                |   6 +
 11 files changed, 979 insertions(+), 402 deletions(-)
 create mode 100644 R/03_03_model_data_finalization.R
 rename R/{04_05_modelling_msdm_rf.R => 04_04_modelling_msdm_rf.R} (56%)
 delete mode 100644 R/04_06_rf_testing.R
 create mode 100644 R/05_01_performance_report_rf.qmd

diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index d7ef767..1b9527b 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -1,20 +1,13 @@
 # General packages
 library(dplyr)
 library(tidyr)
-library(furrr)
+library(ggplot2)
 
 # Geo packages
 library(terra)
-library(CoordinateCleaner)
 library(sf)
 library(geos)
-
-# Stat packages
-library(MASS)
-library(dismo)
-library(spatialEco)
 library(blockCV)
-library(caret)
 
 source("R/utils.R")
 
@@ -137,22 +130,9 @@ for(species in setdiff(target_species, species_processed)){
     ) %>% 
     bind_rows(abs_spec)
   
-  ref = st_bbox(sampling_region)[c(1,3,2,4)]
-  ggplot() +
-   ggtitle(species) +
-   geom_sf(data = sa_polygon) +
-   geom_sf(data = range_spec, alpha = 0, color = "red") +
-   geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
-   scale_color_discrete(name = "Presence") +
-   theme_minimal()
-  
-  try({
-   ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", 
-          width = 8, height = 12)
-  })
-  
+
   # Define cross-validation folds
-  pa_spec_core = dplyr::filter(pa_spec, record_type != "background")
+  pa_spec_core = dplyr::filter(pa_spec, record_type == "core")
   pa_spec_background = dplyr::filter(pa_spec, record_type == "background")
   
   folds = tryCatch({
@@ -171,21 +151,28 @@ for(species in setdiff(target_species, species_processed)){
   })
   
   pa_spec = pa_spec_core %>% 
-    dplyr::mutate(fold_eval = folds) %>% 
+    dplyr::mutate(fold_individual = folds) %>% 
     bind_rows(pa_spec_background) %>% 
     group_by(present) %>%
     mutate(weight = 1/n()) %>%
     ungroup()
   
+  # Plot result
+  ref = st_bbox(sampling_region)[c(1,3,2,4)]
+  ggplot() +
+    ggtitle(species) +
+    geom_sf(data = sa_polygon) +
+    geom_sf(data = range_spec, aes(fill = "IUCN range"), alpha = 0.1, color = NA, show.legend = "fill") +
+    geom_sf(data = sampling_region, aes(fill = "Sampling region"), alpha = 0.1, color = NA, show.legend = "fill") +
+    geom_sf(data = pa_spec, aes(color = as.factor(present), shape = as.factor(record_type)), size = 0.7) +
+    scale_fill_manual(name = "Polygons", values = c("IUCN range" = "red", "Sampling region" = "blue")) +
+    scale_color_discrete(name = "Presence") +
+    scale_shape_discrete(name = "Record type") +
+    theme_minimal()
+  
+  try({
+    ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 3)
+  })
+  
   save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData"))
 }
-
-# Combine presence-absence data
-files = list.files("data/r_objects/pa_sampling/", full.names = T)
-model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
-  bind_rows() %>% 
-  group_by(species) %>% 
-  dplyr::filter(all(1:5 %in% fold_eval)) %>% 
-  ungroup
-
-save(model_data, file = "data/r_objects/model_data.RData")
diff --git a/R/03_03_model_data_finalization.R b/R/03_03_model_data_finalization.R
new file mode 100644
index 0000000..8631c4d
--- /dev/null
+++ b/R/03_03_model_data_finalization.R
@@ -0,0 +1,79 @@
+library(dplyr)
+library(tidyr)
+library(ggplot2)
+library(gridExtra)
+library(sf)
+library(blockCV)
+
+source("R/utils.R")
+
+# Combine presence-absence data
+files = list.files("data/r_objects/pa_sampling/", full.names = T)
+
+model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
+  bind_rows() 
+  
+model_data_core = dplyr::filter(model_data, record_type == "core")
+model_data_background = dplyr::filter(model_data, record_type == "background")
+  
+# Define global folds
+spatial_folds = suppressMessages(
+  blockCV::cv_spatial(
+    model_data_core,
+    column = "present",
+    k = 5
+  )
+)
+
+model_data_core$fold_global = spatial_folds$folds_ids
+model_data = bind_rows(model_data_core, model_data_background) %>% 
+  arrange(species) %>% 
+  relocate(contains("fold"), .after = last_col()) %>% 
+  mutate(species = as.factor(species))
+
+# Explore folds assignment
+cv_plot(spatial_folds)
+ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5)
+
+folds_summary = model_data %>% 
+  dplyr::filter(record_type == "core") %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::group_by(species, fold_global) %>% 
+  dplyr::summarise(
+    n_occ = length(which(present == 1)),
+    n_abs = length(which(present == 0))
+  ) 
+
+folds_uniform = folds_summary %>% 
+  dplyr::group_by(species, fold_global) %>% 
+  dplyr::summarise(
+    is_uniform = n_occ == 0 | n_abs == 0
+  ) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::summarise(
+    folds_total = n(),
+    folds_uniform = length(which(is_uniform)),
+    folds_mixed = folds_total - folds_uniform
+  ) %>% 
+  dplyr::ungroup()
+
+plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + 
+  geom_histogram(binwidth=0.5, fill = "orange") + 
+  labs(title="Number of folds per species", x="Count", y = "Number of species") +
+  theme_minimal()
+plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) + 
+  geom_histogram(binwidth=0.5, fill = "violet") + 
+  labs(title="Number of uniform folds per species", x="Count", y = "Number of species",
+       subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") +
+  theme_minimal()
+plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) + 
+  geom_histogram(binwidth=0.5, fill = "blue") + 
+  labs(title="Number of mixed folds per species", x="Count", y = "Number of species",
+       subtitle="Mixed folds contain both absences and presences and can be used in model evaluation.") +
+  theme_minimal()
+
+plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1)
+ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7)
+
+# Save Model data
+save(model_data, file = "data/r_objects/model_data.RData")
\ No newline at end of file
diff --git a/R/04_01_modelling_ssdm.R b/R/04_01_modelling_ssdm.R
index bc1346c..77c25ec 100644
--- a/R/04_01_modelling_ssdm.R
+++ b/R/04_01_modelling_ssdm.R
@@ -4,82 +4,67 @@ library(sf)
 library(caret)
 library(cito)
 library(pROC)
+library(parallel)
+library(doParallel)
+library(foreach)
 
 source("R/utils.R")
 
 load("data/r_objects/model_data.RData")
 
-# ----------------------------------------------------------------------#
-# Run training                                                       ####
-# ----------------------------------------------------------------------#
 species_processed = list.files("data/r_objects/ssdm_results/performance/", pattern = "RData") %>% 
   stringr::str_remove(".RData")
 
-data_split = model_data %>% 
-  dplyr::filter(!is.na(fold_eval) & !species %in% species_processed) %>% 
+data_split = model_data %>%
+  sf::st_drop_geometry() %>% 
+  dplyr::filter(!species %in% species_processed) %>% 
   dplyr::group_by(species) %>% 
   dplyr::group_split()
 
-for(pa_spec in data_split){
+# ----------------------------------------------------------------------#
+# Set up parallel processing                                         ####
+# ----------------------------------------------------------------------#
+num_cores <- 2
+cl <- makeCluster(num_cores)
+registerDoParallel(cl)
+
+# ----------------------------------------------------------------------#
+# Run training                                                       ####
+# ----------------------------------------------------------------------#
+# Use foreach for parallel processing across species
+foreach(pa_spec = data_split, .packages = c("dplyr", "caret", "cito", "pROC", "tidyr")) %dopar% {
+  na_performance = list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, 
+                        precision = NA_real_, recall = NA_real_, f1 = NA_real_, 
+                        tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_)
+  
+  predictors = paste0("bio", 1:19)
+  
   species = pa_spec$species[1]
   print(species)
   
-  # Define empty result for performance eval
-  na_performance = list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, 
-                        Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, 
-                        TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
-  
-  # Create factor presence column 
   pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))
+  folds = sort(unique(na.omit(pa_spec$fold_global)))
   
   # Outer CV loop (for averaging performance metrics)
-  performance_cv = lapply(1:5, function(fold){
+  performance_cv = lapply(folds, function(fold){
     print(paste("Fold", fold))
     
     ## Preparations #####
-    data_test = dplyr::filter(pa_spec, fold_eval == fold)
-    data_train = dplyr::filter(pa_spec, is.na(fold_eval) | fold_eval != fold) # include background samples
+    data_train = dplyr::filter(pa_spec, record_type == "background" | fold_global != fold) # include background samples
+    data_test = dplyr::filter(pa_spec, fold_global == fold)
     
-    # Create inner CV folds for model training
-    folds_train = tryCatch({
-      cv_train = blockCV::cv_spatial(
-        data_train,
-        column = "present",
-        k = 5,
-        progress = F, plot = F, report = F
-      )
-      
-      cv_train$folds_ids
-    }, error = function(e){
-      NA
-    })
-    
-    if(is.numeric(folds_train)){
-      data_train$fold_train = folds_train
-    } else {
-      return()
+    if(all(data_train$present == 0)){ # If only absences in training dataset --> return
+      return(NULL)
     }
     
-    # Drop geometry
-    data_train$geometry = NULL
-    data_test$geometry = NULL
-    
-    # Define caret training routine 
-    index_train = lapply(unique(sort(data_train$fold_train)), function(x){
-      return(which(data_train$fold_train != x))
-    })
-    
     train_ctrl = caret::trainControl(
       search = "random",
       classProbs = TRUE, 
-      index = index_train,
+      number = 5,
       summaryFunction = caret::twoClassSummary, 
       savePredictions = "final"
     )
     
-    # Define predictors
-    predictors = paste0("bio", 1:19)
-    
     ## Random Forest #####
     rf_performance = tryCatch({
       # Fit model
@@ -87,9 +72,9 @@ for(pa_spec in data_split){
         x = data_train[, predictors],
         y = data_train$present_fct,
         method = "ranger",
-        metric = "Accuracy",
         trControl = train_ctrl,
         tuneLength = 8,
+        weights = data_train$weight,
         verbose = F
       )
       
@@ -106,6 +91,7 @@ for(pa_spec in data_split){
         method = "gbm",
         trControl = train_ctrl,
         tuneLength = 8,
+        weights = data_train$weight,
         verbose = F
       )
       evaluate_model(gbm_fit, data_test)
@@ -120,6 +106,7 @@ for(pa_spec in data_split){
         y = data_train$present_fct,
         method = "gamSpline",
         tuneLength = 8,
+        weights = data_train$weight,
         trControl = train_ctrl
       )
       evaluate_model(gam_fit, data_test)
@@ -137,7 +124,7 @@ for(pa_spec in data_split){
         loss = "binomial",
         epochs = 200L, 
         burnin = 200L,
-        early_stopping = 30,
+        early_stopping = 25,
         lr = 0.001,   
         batchsize = min(ceiling(nrow(data_train)/10), 64),
         dropout = 0.25,
@@ -157,7 +144,7 @@ for(pa_spec in data_split){
     performance_summary = tibble(
       species = !!species,
       obs = nrow(data_train),
-      fold_eval = fold,
+      fold_global = fold,
       model = c("rf", "gbm", "gam", "nn"),
       auc = c(rf_performance$auc, gbm_performance$auc, gam_performance$auc, nn_performance$auc),
       accuracy = c(rf_performance$accuracy, gbm_performance$accuracy, gam_performance$accuracy, nn_performance$accuracy),
@@ -170,7 +157,7 @@ for(pa_spec in data_split){
       tn = c(rf_performance$tn, gbm_performance$tn, gam_performance$tn, nn_performance$tn),
       fn = c(rf_performance$fn, gbm_performance$fn, gam_performance$fn, nn_performance$fn)
     ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") 
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") 
     
     return(performance_summary)
   })
@@ -179,12 +166,15 @@ for(pa_spec in data_split){
   performance_spec = bind_rows(performance_cv) 
   if(nrow(performance_spec) != 0){
     performance_spec = performance_spec %>% 
-      dplyr::arrange(fold_eval, model, metric)
+      dplyr::arrange(fold_global, model, metric)
     
     save(performance_spec, file = paste0("data/r_objects/ssdm_results/performance/", species, ".RData"))
   }
 }
 
+# Stop the parallel cluster when done
+stopCluster(cl)
+
 # Combine results  
 files = list.files("data/r_objects/ssdm_results/performance/", full.names = T, pattern = ".RData")
 ssdm_performance = lapply(files, function(f){load(f); return(performance_spec)}) %>% 
@@ -196,7 +186,6 @@ save(ssdm_performance, file = "data/r_objects/ssdm_performance.RData")
 # Train full models                                                  ####
 # ----------------------------------------------------------------------#
 data_split = model_data %>% 
-  dplyr::filter(!is.na(fold_eval)) %>% 
   dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>%
   dplyr::group_by(species) %>% 
   dplyr::group_split()
@@ -230,7 +219,7 @@ for(pa_spec in data_split){
     summaryFunction = caret::twoClassSummary, 
     savePredictions = "final"
   )
-  
+
   # Define predictors
   predictors = paste0("bio", 1:19)
   
@@ -240,12 +229,11 @@ for(pa_spec in data_split){
       x = pa_spec[, predictors],
       y = pa_spec$present_fct,
       method = "ranger",
-      metric = "Accuracy",
       trControl = train_ctrl,
       tuneLength = 8,
+      weights = pa_spec$weight,
       verbose = F
     )
-    
     save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData"))
   })
   
@@ -258,6 +246,7 @@ for(pa_spec in data_split){
       method = "gbm",
       trControl = train_ctrl,
       tuneLength = 8,
+      weights = pa_spec$weight,
       verbose = F
     )
     save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData"))
@@ -269,8 +258,9 @@ for(pa_spec in data_split){
       x = pa_spec[, predictors],
       y = pa_spec$present_fct,
       method = "gamSpline",
-      trControl = train_ctrl,
       tuneLength = 8,
+      weights = pa_spec$weight,
+      trControl = train_ctrl
     )
     save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData"))
   })
@@ -285,9 +275,9 @@ for(pa_spec in data_split){
       loss = "binomial",
       epochs = 200L, 
       burnin = 200L,
-      early_stopping = 30,
+      early_stopping = 25,
       lr = 0.001,   
-      batchsize = min(ceiling(nrow(pa_spec)/10), 64),
+      batchsize = min(ceiling(nrow(pa_spec)/10), 64),
       dropout = 0.25,
       optimizer = config_optimizer("adam"),
       validation = 0.2,
diff --git a/R/04_02_modelling_msdm_embed.R b/R/04_02_modelling_msdm_embed.R
index 48c4e9c..9814cb5 100644
--- a/R/04_02_modelling_msdm_embed.R
+++ b/R/04_02_modelling_msdm_embed.R
@@ -7,7 +7,6 @@ source("R/utils.R")
 load("data/r_objects/model_data.RData")
 
 model_data = model_data %>% 
-  dplyr::mutate(species = as.factor(species)) %>% 
   sf::st_drop_geometry()
 
 # ----------------------------------------------------------------------#
@@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " +
 # 1. Cross validation
 for(fold in 1:5){
   # Prepare data
-  train_data = dplyr::filter(model_data, fold_eval != fold)
+  data_train = model_data %>% 
+    dplyr::filter(record_type == "background" | fold_global != fold)
   
   # Run model
-  converged = F
-  while(!converged){
-    msdm_embed_fit = dnn(
-      formula,
-      data = train_data,
-      hidden = c(200L, 200L, 200L),
-      loss = "binomial",
-      epochs = 5000, 
-      lr = 0.001,   
-      batchsize = 4096,
-      dropout = 0.25,
-      burnin = 50,
-      optimizer = config_optimizer("adam"),
-      early_stopping = 200,
-      validation = 0.2,
-      device = "cuda"
-    )
-    
-    if(min(msdm_embed_fit$losses$valid_l, na.rm = T) < 0.4){
-      converged = T
-    }
-  }
+  msdm_embed_fit = dnn(
+    formula,
+    data = data_train,
+    hidden = c(200L, 200L, 200L),
+    loss = "binomial",
+    epochs = 5000, 
+    lr = 0.001,   
+    batchsize = 1024,
+    dropout = 0.25,
+    burnin = 50,
+    optimizer = config_optimizer("adam"),
+    early_stopping = 100,
+    validation = 0.2,
+    device = "cuda"
+  )
   
   save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold,".RData"))
 }
@@ -56,12 +49,11 @@ msdm_embed_fit = dnn(
   loss = "binomial",
   epochs = 5000, 
   lr = 0.001,   
-  baseloss = 1,
-  batchsize = 4096,
+  batchsize = 1024,
   dropout = 0.25,
-  burnin = 500,
+  burnin = 50,
   optimizer = config_optimizer("adam"),
-  early_stopping = 300,
+  early_stopping = 100,
   validation = 0.2,
   device = "cuda"
 )
@@ -74,15 +66,15 @@ save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed
 msdm_embed_performance = lapply(1:5, function(fold){
   load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData"))
   
-  test_data_split = model_data %>% 
-    dplyr::filter(fold_eval == fold) %>% 
+  data_test_split = model_data %>% 
+    dplyr::filter(fold_global == fold) %>% 
     dplyr::group_split(species)
   
-  lapply(test_data_split, function(test_data_spec){
-    species = test_data_spec$species[1]
+  lapply(data_test_split, function(data_test_spec){
+    species = data_test_spec$species[1]
     
     performance = tryCatch({
-      evaluate_model(msdm_embed_fit, test_data_spec)
+      evaluate_model(msdm_embed_fit, data_test_spec)
     }, error = function(e){
       list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, 
            precision = NA_real_, recall = NA_real_, f1 = NA_real_, 
@@ -93,11 +85,11 @@ msdm_embed_performance = lapply(1:5, function(fold){
       as_tibble() %>% 
       mutate(
         species = !!species,
-        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
-        fold_eval = !!fold,
+        obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
+        fold_global = !!fold,
         model = "msdm_embed",
       ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value")
   }) %>% 
     bind_rows()
 }) %>% 
diff --git a/R/04_03_modelling_msdm_onehot.R b/R/04_03_modelling_msdm_onehot.R
index f5e7ee3..114b606 100644
--- a/R/04_03_modelling_msdm_onehot.R
+++ b/R/04_03_modelling_msdm_onehot.R
@@ -7,7 +7,6 @@ source("R/utils.R")
 load("data/r_objects/model_data.RData")
 
 model_data = model_data %>% 
-  dplyr::mutate(species = as.factor(species)) %>% 
   sf::st_drop_geometry()
 
 # ----------------------------------------------------------------------#
@@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " +
 # 1. Cross validation
 for(fold in 1:5){
   # Prepare data
-  train_data = dplyr::filter(model_data, fold_eval != fold)
+  data_train = model_data %>% 
+    dplyr::filter(record_type == "background" | fold_global != fold)
   
   # Run model
-  converged = F
-  while(!converged){
-    msdm_onehot_fit = dnn(
-      formula,
-      data = train_data,
-      hidden = c(200L, 200L, 200L),
-      loss = "binomial",
-      epochs = 5000, 
-      lr = 0.001,   
-      batchsize = 4096,
-      dropout = 0.25,
-      burnin = 50,
-      optimizer = config_optimizer("adam"),
-      early_stopping = 200,
-      validation = 0.2,
-      device = "cuda"
-    )
-    
-    if(min(msdm_onehot_fit$losses$valid_l, na.rm = T) < 0.4){
-      converged = T
-    }
-  }
+  msdm_onehot_fit = dnn(
+    formula,
+    data = data_train,
+    hidden = c(200L, 200L, 200L),
+    loss = "binomial",
+    epochs = 5000, 
+    lr = 0.001,   
+    batchsize = 1024,
+    dropout = 0.25,
+    burnin = 50,
+    optimizer = config_optimizer("adam"),
+    early_stopping = 100,
+    validation = 0.2,
+    device = "cuda"
+  )
   
   save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold,".RData"))
 }
@@ -56,12 +49,11 @@ msdm_onehot_fit = dnn(
   loss = "binomial",
   epochs = 5000, 
   lr = 0.001,   
-  baseloss = 1,
-  batchsize = 4096,
+  batchsize = 1024,
   dropout = 0.25,
-  burnin = 500,
+  burnin = 50,
   optimizer = config_optimizer("adam"),
-  early_stopping = 300,
+  early_stopping = 100,
   validation = 0.2,
   device = "cuda"
 )
@@ -74,15 +66,15 @@ save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_one
 msdm_onehot_performance = lapply(1:5, function(fold){
   load(paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold, ".RData"))
   
-  test_data_split = model_data %>% 
-    dplyr::filter(fold_eval == fold) %>% 
+  data_test_split = model_data %>% 
+    dplyr::filter(fold_global == fold) %>% 
     dplyr::group_split(species)
   
-  lapply(test_data_split, function(test_data_spec){
-    species = test_data_spec$species[1]
+  lapply(data_test_split, function(data_test_spec){
+    species = data_test_spec$species[1]
     
     performance = tryCatch({
-      evaluate_model(msdm_onehot_fit, test_data_spec)
+      evaluate_model(msdm_onehot_fit, data_test_spec)
     }, error = function(e){
       list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, 
            precision = NA_real_, recall = NA_real_, f1 = NA_real_, 
@@ -93,11 +85,11 @@ msdm_onehot_performance = lapply(1:5, function(fold){
       as_tibble() %>% 
       mutate(
         species = !!species,
-        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
-        fold_eval = !!fold,
+        obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
+        fold_global = !!fold,
         model = "msdm_onehot",
       ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value")
   }) %>% 
     bind_rows()
 }) %>% 
diff --git a/R/04_05_modelling_msdm_rf.R b/R/04_04_modelling_msdm_rf.R
similarity index 56%
rename from R/04_05_modelling_msdm_rf.R
rename to R/04_04_modelling_msdm_rf.R
index 0e349b8..4544b73 100644
--- a/R/04_05_modelling_msdm_rf.R
+++ b/R/04_04_modelling_msdm_rf.R
@@ -5,42 +5,35 @@ library(ranger)
 
 source("R/utils.R")
 
-load("data/r_objects/model_data_random_abs_extended.RData")
+load("data/r_objects/model_data.RData")
 
 model_data = model_data %>% 
-  dplyr::filter(!is.na(fold_eval)) %>% 
   dplyr::mutate(
-    species = as.factor(species),
     present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
-  ) 
+  ) %>% 
+  sf::st_drop_geometry()
 
 # ----------------------------------------------------------------------#
 # Train model                                                        ####
 # ----------------------------------------------------------------------#
 # Define predictors
-predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species")
+predictors = c(paste0("bio", 1:19), "species")
 
 # Cross validation
 for(fold in 1:5){
+  print(paste("Fold", fold))
   ## Preparations #####
-  data_train = dplyr::filter(model_data, fold_eval != fold) %>% 
-    sf::st_drop_geometry()
+  data_train = model_data %>% 
+    dplyr::filter(record_type == "background" | fold_global != fold)
   
-  # Define caret training routine 
   train_ctrl = caret::trainControl(
-    method = "cv",
-    number = 5,
+    search = "random",
     classProbs = TRUE, 
+    number = 5,
     summaryFunction = caret::twoClassSummary, 
     savePredictions = "final"
   )
   
-  tune_grid = expand.grid(
-    mtry = c(2,4,6,8),
-    splitrule = "gini",
-    min.node.size = c(1,4,9,16)
-  )
-  
   # Run model
   rf_fit = caret::train(
     x = data_train[, predictors],
@@ -48,53 +41,46 @@ for(fold in 1:5){
     method = "ranger",
     metric = "Accuracy",
     trControl = train_ctrl,
-    tuneGrid = tune_grid,
+    tuneLength = 8,
     weights = data_train$weight,
     num.threads = 48
   )
   
-  save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
+  save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData"))
 }
 
 # Full model
 # Define caret training routine 
-full_data = model_data %>% 
-  sf::st_drop_geometry()
-
 train_ctrl = caret::trainControl(
-  method = "cv",
-  number = 5,
+  search = "random",
   classProbs = TRUE, 
+  number = 5,
   summaryFunction = caret::twoClassSummary, 
   savePredictions = "final"
 )
 
-tune_grid = expand.grid(
-  mtry = c(2,4,6,8),
-  splitrule = "gini",
-  min.node.size = c(1,4,9,16)
-)
-
 # Run model
 rf_fit = caret::train(
-  x = full_data[, predictors],
-  y = full_data$present_fct,
+  x = model_data[, predictors],
+  y = model_data$present_fct,
   method = "ranger",
   metric = "Accuracy",
   trControl = train_ctrl,
-  tuneGrid = tune_grid
+  tuneLength = 8,
+  weights = model_data$weight,
+  num.threads = 48
 )
 
-save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
+save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
 
 # ----------------------------------------------------------------------#
 # Evaluate model                                                     ####
 # ----------------------------------------------------------------------#
-msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){
-  load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
+msdm_rf_performance = lapply(1:5, function(fold){
+  load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData"))
   
   test_data = model_data %>% 
-    dplyr::filter(fold_eval == fold, record_type != "absence_background") %>% 
+    dplyr::filter(fold_global == fold) %>% 
     sf::st_drop_geometry()
   
   actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
@@ -119,36 +105,36 @@ msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){
       cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")
       
       list(
-        AUC = as.numeric(auc),
-        Accuracy = cm$overall["Accuracy"],
-        Kappa = cm$overall["Kappa"],
-        Precision = cm$byClass["Precision"],
-        Recall = cm$byClass["Recall"],
-        F1 = cm$byClass["F1"],
-        TP = cm$table["P", "P"],
-        FP = cm$table["P", "A"],
-        TN = cm$table["A", "A"],
-        FN = cm$table["A", "P"]
+        auc = as.numeric(auc),
+        accuracy = cm$overall["Accuracy"],
+        kappa = cm$overall["Kappa"],
+        precision = cm$byClass["Precision"],
+        recall = cm$byClass["Recall"],
+        f1 = cm$byClass["F1"],
+        tp = cm$table["P", "P"],
+        fp = cm$table["P", "A"],
+        tn = cm$table["A", "A"],
+        fn = cm$table["A", "P"]
       )
     }, error = function(e){
-      list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, 
-           Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, 
-           TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
+      list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, 
+           precision = NA_real_, recall = NA_real_, f1 = NA_real_, 
+           tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_)
     })
     
     performance_summary = performance %>% 
       as_tibble() %>% 
       mutate(
         species = !!species,
-        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
-        fold_eval = !!fold,
-        model = "MSDM_rf_random_abs_extended",
+        obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
+        fold_global = !!fold,
+        model = "msdm_rf",
       ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") %>% 
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") %>% 
       drop_na()
   }) %>% 
     bind_rows()
 }) %>% 
   bind_rows()
 
-save(msdm_rf_random_abs_extended_performance, file = paste0("data/r_objects/msdm_rf_random_abs_extended_performance.RData"))
+save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData"))
diff --git a/R/04_06_rf_testing.R b/R/04_06_rf_testing.R
deleted file mode 100644
index 9057285..0000000
--- a/R/04_06_rf_testing.R
+++ /dev/null
@@ -1,144 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(caret)
-library(ranger)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data_random_abs.RData")
-
-model_data = model_data %>% 
-  dplyr::filter(!is.na(fold_eval)) %>% 
-  dplyr::mutate(
-    species = as.factor(species),
-    present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
-  ) 
-
-predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species") 
-
-full_data = model_data %>% 
-  sf::st_drop_geometry()
-
-train_ctrl = caret::trainControl(
-  search = "random",
-  classProbs = TRUE, 
-  summaryFunction = caret::twoClassSummary, 
-  savePredictions = "final"
-)
-
-# Run model
-rf_fit = caret::train(
-  x = full_data[, predictors],
-  y = full_data$present_fct,
-  method = "ranger",
-  metric = "Accuracy",
-  trControl = train_ctrl,
-  tuneLength = 1,
-  num.threads = 48,
-  importance = 'impurity',
-  verbose = T
-)
-
-save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
-
-varimp = varImp(rf_fit)
-
-# Cross validation
-for(fold in 1:5){
-  ## Preparations #####
-  data_train = dplyr::filter(model_data, fold_eval != fold) %>% 
-    sf::st_drop_geometry()
-  
-  # Define caret training routine 
-  train_ctrl = caret::trainControl(
-    method = "cv",
-    number = 5,
-    classProbs = TRUE, 
-    summaryFunction = caret::twoClassSummary, 
-    savePredictions = "final"
-  )
-  
-  tune_grid = expand.grid(
-    mtry = c(2,4,6,8),
-    splitrule = "gini",
-    min.node.size = c(1,4,9,16)
-  )
-  
-  # Run model
-  rf_fit = caret::train(
-    x = data_train[, predictors],
-    y = data_train$present_fct,
-    method = "ranger",
-    metric = "Accuracy",
-    trControl = train_ctrl,
-    tuneGrid = tune_grid,
-    num.threads = 48,
-    verbose = F
-  )
-  
-  save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
-}
-
-# ----------------------------------------------------------------------#
-# Evaluate model                                                     ####
-# ----------------------------------------------------------------------#
-msdm_rf_random_abs_performance = lapply(1:5, function(fold){
-  load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
-  
-  test_data = dplyr::filter(model_data, fold_eval == fold) %>% 
-    sf::st_drop_geometry()
-  
-  actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
-  probs = predict(rf_fit, test_data, type = "prob")$P
-  preds = predict(rf_fit, test_data, type = "raw")
-  
-  eval_dfs = data.frame(
-    species = test_data$species,
-    actual,
-    probs,
-    preds
-  ) %>% 
-    group_by(species) %>% 
-    group_split()
-  
-  
-  lapply(eval_dfs, function(eval_df_spec){
-    species = eval_df_spec$species[1]
-    
-    performance = tryCatch({
-      auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs, levels = c("P", "A"), direction = ">")$auc
-      cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")
-      
-      list(
-        AUC = as.numeric(auc),
-        Accuracy = cm$overall["Accuracy"],
-        Kappa = cm$overall["Kappa"],
-        Precision = cm$byClass["Precision"],
-        Recall = cm$byClass["Recall"],
-        F1 = cm$byClass["F1"],
-        TP = cm$table["P", "P"],
-        FP = cm$table["P", "A"],
-        TN = cm$table["A", "A"],
-        FN = cm$table["A", "P"]
-      )
-    }, error = function(e){
-      list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, 
-           Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, 
-           TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
-    })
-    
-    performance_summary = performance %>% 
-      as_tibble() %>% 
-      mutate(
-        species = !!species,
-        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
-        fold_eval = !!fold,
-        model = "MSDM_rf_random_abs",
-      ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
-  }) %>% 
-    bind_rows()
-}) %>% 
-  bind_rows()
-
-save(msdm_rf_random_abs_performance, file = paste0("data/r_objects/msdm_rf_random_abs_performance.RData"))
diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd
index bc5a0b7..8fb626b 100644
--- a/R/05_01_performance_report.qmd
+++ b/R/05_01_performance_report.qmd
@@ -12,9 +12,9 @@ library(plotly)
 library(DT)
 
 load("../data/r_objects/model_data.RData")
-load("../data/r_objects/ssdm_performance.RData")
+#load("../data/r_objects/ssdm_performance.RData")
 load("../data/r_objects/msdm_embed_performance.RData")
-load("../data/r_objects/msdm_onehot_performance.RData")
+#load("../data/r_objects/msdm_onehot_performance.RData")
 load("../data/r_objects/msdm_rf_performance.RData")
 
 load("../data/r_objects/range_maps.RData")
@@ -56,16 +56,12 @@ loess_fit = function(x, y, span = 0.75){
 }
 
 # Performance table
-performance = ssdm_results %>% 
-  bind_rows(msdm_embed_performance) %>% 
-  bind_rows(msdm_onehot_performance) %>% 
+performance = msdm_embed_performance %>% 
+  #bind_rows(msdm_embed_performance) %>% 
+  #bind_rows(msdm_onehot_performance) %>% 
   bind_rows(msdm_rf_performance) %>% 
-  dplyr::mutate(   # TODO: remove when only working with new data
-    metric = stringr::str_to_lower(metric),
-    model = stringr::str_to_lower(model)
-  ) %>% 
   dplyr::group_by(species, model, metric) %>% 
-  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
+  dplyr::summarise(value = mean(value, na.rm = T)) %>% 
   dplyr::mutate(
     value = case_when(
       ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
diff --git a/R/05_01_performance_report_rf.qmd b/R/05_01_performance_report_rf.qmd
new file mode 100644
index 0000000..534a5f8
--- /dev/null
+++ b/R/05_01_performance_report_rf.qmd
@@ -0,0 +1,692 @@
+---
+title: "SDM Performance report"
+format: html
+editor: visual
+engine: knitr
+---
+
+```{r init, echo = FALSE, include = FALSE}
+library(tidyverse)
+library(sf)
+library(plotly)
+library(DT)
+
+load("../data/r_objects/model_data.RData")
+load("../data/r_objects/range_maps.RData")
+load("../data/r_objects/range_maps_gridded.RData")
+load("../data/r_objects/occs_final.RData")
+load("../data/r_objects/functional_groups.RData")
+
+sf::sf_use_s2(use_s2 = FALSE)
+
+# Load performance of different RF versions
+load("../data/r_objects/deprecated/msdm_rf_no_species_random_abs_performance.RData")
+load("../data/r_objects/deprecated/msdm_rf_no_species_performance.RData")
+load("../data/r_objects/msdm_rf_performance.RData")
+
+msdm_rf_no_species_performance$model = "rf_noSpec_oldVars_informedAbs"
+msdm_rf_no_species_performance$metric = tolower(msdm_rf_no_species_performance$metric)
+msdm_rf_no_species_random_abs_performance$model = "rf_noSpec_oldVars_randomAbs"
+msdm_rf_no_species_random_abs_performance$metric = tolower(msdm_rf_no_species_random_abs_performance$metric)
+msdm_rf_performance$model = "rf_noSpec_newVars_randomAbs"
+```
+
+```{r globals, echo = FALSE, cache = TRUE, include = FALSE}
+# Regression functions
+asym_fit = function(x, y){
+  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
+  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
+  data.frame(
+    x = new_x,
+    fit = predict(nls_fit, newdata = data.frame(x = new_x))
+  )
+}
+
+glm_fit = function(x, y, family = "binomial"){
+  glm_fit = suppressWarnings(glm(y~x, family = family))
+  new_x = seq(min(x), max(x), length.out = 100)
+  data.frame(
+    x = new_x,
+    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
+  )
+}
+
+loess_fit = function(x, y, span = 0.75){
+  df = data.frame(x = x, y = y)
+  loess_fit = loess(y ~ x, data = df, span = span)
+  new_x = seq(min(x), max(x), length.out = 100)
+  data.frame(
+    x = new_x,
+    fit = predict(loess_fit, newdata = data.frame(x = new_x))
+  )
+}
+
+# Performance table
+performance = msdm_rf_performance %>% 
+  bind_rows(msdm_rf_no_species_performance) %>% 
+  bind_rows(msdm_rf_no_species_random_abs_performance) %>% 
+  dplyr::group_by(species, model, metric) %>% 
+  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
+  dplyr::mutate(
+    value = case_when(
+      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
+      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
+      .default = value
+    )
+  ) %>% 
+  ungroup()
+
+plot_performance = function(df_plot, y_var, y_label, x_var, x_label, x_log = T, reg_func = loess_fit) {
+  df_plot = df_plot %>% 
+    dplyr::rename(
+      x_var = !!x_var,
+      y_var = !!y_var
+    )
+  
+  # Calculate regression lines for each model and metric combination
+  suppressWarnings({
+    regression_lines = df_plot %>%
+      group_by(model) %>%
+      group_modify( ~ reg_func(.x[["x_var"]], .x[["y_var"]]))
+  })
+  
+  # Create base plot
+  plot <- plot_ly() %>%
+    layout(
+      title = paste0("Model Performance vs. ", x_label),
+      xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")),
+      yaxis = list(title = y_label),
+      legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
+      margin = list(r = 150), # Add right margin to accommodate legend
+      hovermode = 'closest'
+    )
+  
+  # Points
+  for (model_name in unique(df_plot$model)) {
+    plot = plot %>%
+      add_markers(
+        data = filter(df_plot, model == model_name),
+        x = ~ x_var,
+        y = ~ y_var,
+        color = model_name, # Set color to match legendgroup
+        legendgroup = model_name,
+        opacity = 0.6,
+        name = ~ model,
+        hoverinfo = 'text',
+        text = ~ paste(
+          "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(y_var, 3)
+        )
+      )
+  }
+  
+  # Add regression lines
+  for (model_name in unique(df_plot$model)) {
+    reg_data = dplyr::filter(regression_lines, model == model_name)
+    plot = plot %>%
+      add_lines(
+        data = reg_data,
+        x = ~ x,
+        y = ~ fit,
+        color = model_name, # Set color to match legendgroup
+        legendgroup = model_name,
+        name = paste(model_name, '(fit)'),
+        showlegend = FALSE
+      )
+  }
+  
+  return(plot)
+}
+
+add_partial_residuals = function(df, focus_var, control_vars, family = quasibinomial) {
+  model_formula <- as.formula(paste("value ~", paste(c(focus_var, control_vars), collapse = " + ")))
+  model = glm(model_formula, data = df, family = family)
+  
+  if (focus_var %in% names(model$coefficients)) {
+    df[[paste0(focus_var, "_partial")]] = residuals(model, type = "response") + model$coefficients[focus_var] * df[[focus_var]]
+  }
+  
+  return(df)
+}
+
+```
+
+## Summary
+
+This document contrasts the performance of different variations of random forest models. 
+
+## Analysis
+
+### Quantify drivers of model performance
+
+```{r prepare_model_df, echo = FALSE, include = FALSE}
+# Number of records
+df_join = model_data %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::filter(present == 1) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::summarise(n_obs = n())
+
+performance = performance %>% 
+  dplyr::left_join(df_join, by = "species") 
+
+# Range size
+df_join = range_maps %>% 
+  sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs") %>% 
+  dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>%  # range in sqkm
+  sf::st_drop_geometry()
+
+performance = performance %>% 
+  inner_join(df_join, by = c("species" = "name_matched"))
+
+# Range coverage
+range_maps_gridded = range_maps_gridded %>%
+  sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs")
+
+df_cells_total = range_maps_gridded %>%
+  dplyr::rename("species" = name_matched) %>% 
+  group_by(species) %>%
+  summarise(cells_total = n()) %>% 
+  st_drop_geometry()
+
+df_cells_occ = range_maps_gridded %>% 
+  st_join(occs_final, join = st_intersects) %>%
+  filter(name_matched == species) %>%         # Filter only intersections of the same species
+  group_by(species) %>%
+  summarise(cells_occupied = n_distinct(geometry)) %>%
+  st_drop_geometry()
+
+df_join = df_cells_total %>% 
+  dplyr::inner_join(df_cells_occ, by = "species") %>% 
+  dplyr::mutate(range_cov = cells_occupied / cells_total) %>% 
+  dplyr::select(species, range_cov)
+
+performance = performance %>% 
+  inner_join(df_join, by = "species")
+
+# Range coverage bias
+df_occs_total = occs_final %>% 
+  st_drop_geometry() %>% 
+  group_by(species) %>% 
+  summarise(occs_total = n())
+
+df_join = df_occs_total %>% 
+  dplyr::inner_join(df_cells_total, by = "species") %>% 
+  dplyr::inner_join(df_cells_occ, by = "species") %>% 
+  dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) %>%
+  dplyr::select(species, range_bias)
+
+performance = performance %>% 
+  inner_join(df_join, by = "species")
+```
+
+### Number of records
+
+::: panel-tabset
+#### Marginal Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc")
+plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1")
+plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+:::
+
+#### Partial Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "n_obs", x_label = "Number of records")
+bslib::card(plot, full_screen = T)
+```
+:::
+:::
+
+### Range size
+
+Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).
+
+::: panel-tabset
+#### Marginal Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc")
+plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1")
+plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+:::
+
+#### Partial Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
+bslib::card(plot, full_screen = T)
+```
+:::
+:::
+
+### Range coverage
+
+Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.
+
+$$
+RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
+$$
+
+::: panel-tabset
+#### Marginal Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc")
+plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1")
+plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+:::
+
+#### Partial Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_cov", x_label = "Range coverage")
+bslib::card(plot, full_screen = T)
+```
+:::
+:::
+
+### Range coverage bias
+
+Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
+
+$$
+RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)}
+$$
+
+Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
+
+::: panel-tabset
+#### Marginal Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc")
+plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1")
+plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+:::
+
+#### Partial Effects
+
+::: panel-tabset
+##### AUC
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "auc") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### F1
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "f1") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Cohen's kappa
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "kappa") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Accuracy
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Precision
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "precision") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+
+##### Recall
+
+```{r echo = FALSE}
+df_plot = dplyr::filter(performance, metric == "recall") %>% 
+  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
+bslib::card(plot, full_screen = T)
+```
+:::
+:::
diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R
index 9e2d074..4d347c4 100644
--- a/R/05_02_publication_analysis.R
+++ b/R/05_02_publication_analysis.R
@@ -13,10 +13,11 @@ library(cito)
 
 source("R/utils.R")
 
-load("data/r_objects/model_data_random_abs_extended.RData")
-load("data/r_objects/ssdm_results.RData")
+load("data/r_objects/model_data.RData")
+load("data/r_objects/ssdm_performance.RData")
 load("data/r_objects/msdm_embed_performance.RData")
 load("data/r_objects/msdm_onehot_performance.RData")
+load("data/r_objects/msdm_rf_performance.RData")
 load("data/r_objects/functional_groups.RData")
 
 load("data/r_objects/sa_polygon.RData")
@@ -25,26 +26,27 @@ load("data/r_objects/range_maps.RData")
 sf::sf_use_s2(use_s2 = FALSE)
 
 model_data = model_data %>% 
-  dplyr::filter(!is.na(fold_eval)) %>% 
   dplyr::mutate(species = as.factor(species))
 
 # ------------------------------------------------------------------ #
 # 1. Collect performance results                                  ####
 # ------------------------------------------------------------------ #
-performance = ssdm_results %>% 
+performance = ssdm_performance %>% 
   bind_rows(msdm_embed_performance) %>% 
   bind_rows(msdm_onehot_performance) %>% 
-  dplyr::group_by(species, model, metric) %>% 
-  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
+  bind_rows(msdm_rf_performance) %>% 
   dplyr::mutate(
     metric = factor(tolower(metric), levels = c("auc", "f1", "kappa", "accuracy", "precision", "recall")),
-    model = factor(model, levels = c("GAM", "GBM", "RF", "NN", "MSDM_onehot", "MSDM_embed")),
+    model = factor(model, levels = c("gam", "gbm", "rf", "nn", "msdm_onehot", "msdm_embed", "msdm_rf")),
     value = case_when(
       ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
       ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
       .default = value
     )
-  )
+  ) %>% 
+  dplyr::group_by(species, model, metric) %>% 
+  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
+  ungroup()
 
 # ------------------------------------------------------------------ #
 # 2. Model comparison                                             ####
@@ -81,13 +83,13 @@ obs_count = model_data %>%
   sf::st_drop_geometry() %>% 
   dplyr::filter(present == 1, !is.na(fold_eval)) %>% 
   dplyr::group_by(species) %>% 
-  dplyr::summarise(obs = n())
+  dplyr::summarise(n_obs = n())
 
 df_plot = performance %>% 
   dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) %>% 
   dplyr::left_join(obs_count, by = "species") 
 
-ggplot(df_plot, aes(x = obs, y = value, color = model, fill = model)) +
+ggplot(df_plot, aes(x = n_obs, y = value, color = model, fill = model)) +
   geom_point(alpha = 0.1) +
   geom_smooth() + 
   facet_wrap(~ metric, scales = "free_y") +
@@ -157,18 +159,24 @@ library(ranger)
 plot_predictions = function(spec, model_data, raster_data, algorithms){
   # Species data
   load("data/r_objects/range_maps.RData")
+  load("data/r_objects/sa_polygon.RData")
   
   pa_spec = model_data %>% 
-    dplyr::filter(species == !!spec)
+    dplyr::filter(species == !!spec, record_type == "core")
   range_spec = range_maps %>% 
     dplyr::filter(name_matched == !!spec) %>% 
     sf::st_transform(sf::st_crs(pa_spec))
-  
+  sa_polygon = sa_polygon %>% 
+    sf::st_transform(crs(raster_data, proj = TRUE))
+    
   # Extract raster values into df
   bbox_spec = sf::st_bbox(range_spec) %>% 
-    expand_bbox(expansion = 0.25)
+    expand_bbox(expansion = 0.5)
+  
+  raster_crop = raster_data %>% 
+    terra::crop(bbox_spec) %>% 
+    terra::mask(vect(sa_polygon))
   
-  raster_crop = terra::crop(raster_data, bbox_spec)
   pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),]
   
   new_data = raster_crop %>% 
@@ -191,15 +199,6 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){
     } else if(algorithm == "msdm_rf"){
       load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
       predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
-    } else if(algorithm == "msdm_rf_random_abs"){
-      load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
-      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
-    } else if(algorithm == "msdm_rf_fit_no_species"){
-      load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_full.RData")
-      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
-    } else if(algorithm == "msdm_rf_fit_no_species_random_abs"){
-      load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_random_abs_full.RData")
-      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
     } else if(algorithm == "nn") {
       load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData"))
       new_data_tmp = dplyr::select(new_data, -species)
@@ -218,7 +217,7 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){
     p = ggplot() +
       tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) +
       scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) +
-      geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FFFFFF99") +
+      geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FF000088") +
       geom_sf(data = range_spec, col = "red", fill = NA) +
       scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") +
       theme_minimal() +
@@ -236,16 +235,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){
 
 # Load raster
 raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
+  stringr::str_subset("CHELSA_bio") %>% 
   stringr::str_sort(numeric = T) 
 
 raster_data = terra::rast(raster_filepaths) %>% 
   terra::crop(sf::st_bbox(sa_polygon)) %>% 
   terra::project(sf::st_crs(model_data)$input)
 
-specs = sort(sample(levels(model_data$species), 4))
+specs = c("Histiotus velatus", "Hydrochoerus hydrochaeris", "Euryoryzomys legatus", "Eumops trumbulli")
 for(spec in specs){
-  pdf(file = paste0("plots/range_predictions/", spec, " (msdm_rf).pdf"), width = 12)
-  plots = plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_rf", "msdm_rf_random_abs", "msdm_rf_fit_no_species", "msdm_rf_fit_no_species_random_abs"))
+  pdf(file = paste0("plots/range_predictions/", spec, ".pdf"), width = 12)
+  plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm_onehot", "msdm_embed", "msdm_rf"))
   lapply(plots, print)
   dev.off()
 }
@@ -256,7 +256,6 @@ for(spec in specs){
 load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData")
 load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData")
 load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
-load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
 
 compare_species_predictions = function(model, sample_size){
   specs = sample(unique(model_data$species), 2, replace = F)
@@ -329,7 +328,7 @@ obs_count = model_data %>%
   sf::st_drop_geometry() %>% 
   dplyr::filter(present == 1, !is.na(fold_eval)) %>% 
   dplyr::group_by(species) %>% 
-  dplyr::summarise(obs = n())
+  dplyr::summarise(n_obs = n())
 
 all_embeddings = lapply(1:5, function(fold){
   load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData"))
@@ -377,7 +376,7 @@ r_embed_df = data.frame(
 df_plot = obs_count %>% 
   dplyr::left_join(r_embed_df, by = "species")
 
-ggplot(data = df_plot, aes(x = obs, y = value)) +
+ggplot(data = df_plot, aes(x = n_obs, y = value)) +
   geom_point() +
   geom_smooth() +
   labs(title = "Mean and variance of correlation coefficient of species embeddings across folds") +
@@ -557,3 +556,5 @@ range_dist_subset = range_dist[species_intersect, species_intersect]
 e_indices = match(species_intersect, species_lookup$species)
 embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices]
 FactoMineR::coeffRV(embeddings_dist_subset, range_dist_subset)
+
+# --> Significant correlation between embeddings and pairwise species distances (range > func > phylo)
diff --git a/R/_publish.yml b/R/_publish.yml
index c18ea3e..5604c70 100644
--- a/R/_publish.yml
+++ b/R/_publish.yml
@@ -2,3 +2,9 @@
   quarto-pub:
     - id: 77b30762-b473-447d-be18-d4bbd6261fbf
       url: 'https://chrkoenig.quarto.pub/sdm-performance-report'
+    - id: 7777d450-82af-4364-b4c1-b52d7139ade0
+      url: 'https://chrkoenig.quarto.pub/rf-nospec-performance-report'
+- source: 05_01_performance_report_rf.qmd
+  quarto-pub:
+    - id: 98e776a5-fb0b-41f9-8733-446bf7ffb272
+      url: 'https://chrkoenig.quarto.pub/sdm-performance-report-rf'
-- 
GitLab