diff --git a/.gitignore b/.gitignore
index 3acf079a9890636921e00602124682ed4a556f26..4d2ad7cd89b5eac9b061262753fe098518070806 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,4 @@ renv/cache/
 data/
 plots/
 R/*/
-R/*.html
\ No newline at end of file
+*.html
\ No newline at end of file
diff --git a/R/01_01_range_map_preparation.R b/R/01_01_range_preparation.R
similarity index 99%
rename from R/01_01_range_map_preparation.R
rename to R/01_01_range_preparation.R
index eb33fdc1b726ab35bf7fa5bd97ec638740f30600..4f6a412243451e6b38f5ecc8358aa95083cc4164 100644
--- a/R/01_01_range_map_preparation.R
+++ b/R/01_01_range_preparation.R
@@ -9,7 +9,7 @@ sf::sf_use_s2(use_s2 = FALSE)
 # --------------------------------------#
 #       Process range maps           ####
 # --------------------------------------#
-# Load range maps
+# Load IUCN range maps
 range_maps = st_read(
   "~/share/groups/mas_data/Saved_Data_Dropbox_Business/Datensätze/Range Maps/IUCN_range_maps_mammals_version2016/TERRESTRIAL_MAMMALS.shp",
   geometry_column = "geometry",
diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R
deleted file mode 100644
index e7876e935c8b77377e003bb17490e704abc9246e..0000000000000000000000000000000000000000
--- a/R/01_02_raster_preparation.R
+++ /dev/null
@@ -1,36 +0,0 @@
-library(tidyverse)
-library(terra)
-library(rnaturalearth)
-library(furrr)
-
-# Define Study extent ####
-sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% 
-  dplyr::filter(continent == "South America") %>% 
-  sf::st_union() %>% 
-  st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13)
-
-save(sa_polygon, file = "data/r_objects/sa_polygon.RData")
-
-# CHELSA bioclim target variables ####
-chelsa_urls = paste0("https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio", 1:19,"_1981-2010_V.2.1.tif")
- 
-chelsa = terra::rast(chelsa_urls) %>% 
-  crop(sa_polygon)
-set.names(chelsa, paste0("bio", 1:19))
-terra::writeRaster(chelsa, filename = file.path("data", "geospatial", "raster", basename(chelsa_urls)), overwrite = T)
-
-# Tree canopy cover ####
-igfc = list.files("I:/mas_data/00_data/processed/iGFC/", pattern = "canopyDensity_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% 
-  terra::rast() %>% 
-  terra::resample(chelsa) %>%   
-  app(mean, na.rm = TRUE)
-terra::set.names(igfc, "igfc")
-terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "mean_canopyDensity_1992-2010.tif"), overwrite = T)
-
-# Terrain ####
-roughness = list.files("C:/Users/ye87zine/Downloads", pattern = "roughness", full.names = T) %>% 
-  terra::rast() %>% 
-  crop(study_extent) %>% 
-  terra::resample(chelsa)
-terra::set.names(roughness, "roughness")
-terra::writeRaster(roughness, filename = file.path("data", "geospatial", "raster", "terrain_roughness.tif"), overwrite = T)
diff --git a/R/02_02_functional_traits_preparation.R b/R/01_02_traits_preparation.R
similarity index 64%
rename from R/02_02_functional_traits_preparation.R
rename to R/01_02_traits_preparation.R
index dfec715e7946b3ed97f82c21763d1798622a11c4..0e480830dfd26d97657bcc3926131fece0d5616b 100644
--- a/R/02_02_functional_traits_preparation.R
+++ b/R/01_02_traits_preparation.R
@@ -1,6 +1,49 @@
 library("tidyverse")
 library("traitdata")
+library("Symobio")
+library("vegan")
+
+load("data/r_objects/range_maps.RData")
+
+# -------------------------------------------#
+#       Assign functional groups          ####
+# -------------------------------------------#
+# Get taxonomic information for target species
+names_unique = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
+
+species_matched = lapply(names_unique, function(name){
+  match_result = Symobio::gbif_match_name(name = name)
+  if(match_result$status != "ACCEPTED"){ # Search for accepted name
+    match_result = gbif_match_name(usageKey = match_result$acceptedUsageKey)
+  }
+  match_result$name_orig = name
+  return(match_result)
+}) %>% 
+  bind_rows()
+
+# Assign functional groups
+functional_groups = species_matched %>% 
+  mutate(
+    functional_group = case_when(
+      order %in% c("Carnivora", "Artiodactyla", "Cingulata", "Perissodactyla") ~ 1,
+      order %in% c("Rodentia", "Didelphimorphia", "Soricomorpha", "Paucituberculata", "Lagomorpha") ~ 2,
+      order %in% c("Primates", "Pilosa") ~ 3,
+      order %in% c("Chiroptera") ~ 4
+    ),
+    functional_group = factor(functional_group, labels = c("large ground-dwelling", "small ground-dwelling", "arboreal", "flying"))
+  ) %>% 
+  dplyr::select(
+    name_orig,
+    name_matched = species,
+    functional_group
+  ) %>% 
+  distinct()
 
+save(functional_groups, file = "data/r_objects/functional_groups.RData")
+
+# ---------------------------------------------------#
+#        Process and merge functional traits      ####
+# ---------------------------------------------------#
 # Match names
 traits = traitdata::elton_mammals
 names_unique = unique(traits$scientificNameStd)
@@ -69,12 +112,9 @@ traits_proc = traits_matched %>%
 
 save(traits_proc, file = "data/r_objects/traits_proc.RData")
 
-# Calculate Distances
-library("vegan")
-
-load("data/r_objects/range_maps.RData")
-load("data/r_objects/traits_proc.RData")
-
+# ---------------------------------------------------#
+#        Calculate functional dissimilarity       ####
+# ---------------------------------------------------#
 target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
 traits_target = dplyr::filter(traits_proc, species %in% target_species)
   
diff --git a/R/02_03_phylo_preparation.R b/R/01_03_phylo_preparation.R
similarity index 84%
rename from R/02_03_phylo_preparation.R
rename to R/01_03_phylo_preparation.R
index a72c8332e080f96fa3c45aab2a7e955fa5be0f81..4ba9a85f34c5d325b4383234363b0bc06b339804 100644
--- a/R/02_03_phylo_preparation.R
+++ b/R/01_03_phylo_preparation.R
@@ -6,7 +6,9 @@ library(Symobio)
 forest = read.nexus("data/phylogenies/Complete_phylogeny.nex")
 load("data/r_objects/range_maps_names_matched.RData")
 
-# Get taxonomic information for target species
+# ---------------------------------------------------#
+#        Match taxonomic names in phylogeny       ####
+# ---------------------------------------------------#
 phylo_names_unique = unique(forest$UNTITLED$tip.label)
 
 phylo_names_matched  = lapply(phylo_names_unique, function(name){
@@ -28,6 +30,10 @@ phylo_names_matched  = lapply(phylo_names_unique, function(name){
 
 save(phylo_names_matched, file = "data/r_objects/phylo_names_matched.RData")
 
+# ---------------------------------------------------#
+#         Calculate phylo distances               ####
+# ---------------------------------------------------#
+# Calculate pairwise phylogenetic distances across a random sample 50 of forests
 # Trim forest to target species
 phylo_names_target = dplyr::filter(phylo_names_matched, name_matched %in% range_maps_names_matched$name_matched)
 
@@ -40,8 +46,6 @@ forest_pruned = lapply(forest, function(x) {
 })
 
 class(forest_pruned) <- "multiPhylo"
-
-# Calculate pairwise phylogenetic distances across a random sample 50 of forests
 indices = sample(length(forest_pruned), 50)
 dists = lapply(indices, function(i){
   cophenetic.phylo(forest_pruned[[i]])
diff --git a/R/01_04_raster_preparation.R b/R/01_04_raster_preparation.R
new file mode 100644
index 0000000000000000000000000000000000000000..aa9e21078be7373c2b25a678aecb1ae298f19c3b
--- /dev/null
+++ b/R/01_04_raster_preparation.R
@@ -0,0 +1,20 @@
+library(tidyverse)
+library(terra)
+library(rnaturalearth)
+library(furrr)
+
+# Define Study extent ####
+sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% 
+  dplyr::filter(continent == "South America") %>% 
+  sf::st_union() %>% 
+  st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13)
+
+save(sa_polygon, file = "data/r_objects/sa_polygon.RData")
+
+# CHELSA bioclim target variables ####
+chelsa_urls = paste0("https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio", 1:19,"_1981-2010_V.2.1.tif")
+ 
+chelsa = terra::rast(chelsa_urls) %>% 
+  crop(sa_polygon)
+set.names(chelsa, paste0("bio", 1:19))
+terra::writeRaster(chelsa, filename = file.path("data", "geospatial", "raster", basename(chelsa_urls)), overwrite = T)
\ No newline at end of file
diff --git a/R/02_01_functional_group_assignment.R b/R/02_01_functional_group_assignment.R
deleted file mode 100644
index ee2a06f6390e6ea98475cf654867a026a385e644..0000000000000000000000000000000000000000
--- a/R/02_01_functional_group_assignment.R
+++ /dev/null
@@ -1,34 +0,0 @@
-load("data/r_objects/range_maps.RData")
-
-# Get taxonomic information for target species
-names_unique = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
-
-species_matched = lapply(names_unique, function(name){
-  match_result = Symobio::gbif_match_name(name = name)
-  if(match_result$status != "ACCEPTED"){
-    match_result = gbif_match_name(usageKey = match_result$acceptedUsageKey)
-  }
-  match_result$name_orig = name
-  return(match_result)
-}) %>% 
-  bind_rows()
-
-# Assign functional groups
-functional_groups = species_matched %>% 
-  mutate(
-    functional_group = case_when(
-      order %in% c("Carnivora", "Artiodactyla", "Cingulata", "Perissodactyla") ~ 1,
-      order %in% c("Rodentia", "Didelphimorphia", "Soricomorpha", "Paucituberculata", "Lagomorpha") ~ 2,
-      order %in% c("Primates", "Pilosa") ~ 3,
-      order %in% c("Chiroptera") ~ 4
-    ),
-    functional_group = factor(functional_group, labels = c("large ground-dwelling", "small ground-dwelling", "arboreal", "flying"))
-  ) %>% 
-  dplyr::select(
-    name_orig,
-    name_matched = species,
-    functional_group
-  ) %>% 
-  distinct()
-
-save(functional_groups, file = "data/r_objects/functional_groups.RData")
diff --git a/R/03_01_presence_preparation.R b/R/02_01_presence_data_preparation.R
similarity index 100%
rename from R/03_01_presence_preparation.R
rename to R/02_01_presence_data_preparation.R
diff --git a/R/03_02_absence_preparation.R b/R/02_02_absence_data_preparation.R
similarity index 100%
rename from R/03_02_absence_preparation.R
rename to R/02_02_absence_data_preparation.R
diff --git a/R/03_03_dataset_exploration.R b/R/03_03_dataset_exploration.R
deleted file mode 100644
index ad331734226f8f071007abe5e739d5fd83df1d4b..0000000000000000000000000000000000000000
--- a/R/03_03_dataset_exploration.R
+++ /dev/null
@@ -1,53 +0,0 @@
-library(tidyverse)
-library(sf)
-library(terra)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-load("data/r_objects/sa_polygon.RData")
-
-sa_polygon = sf::st_transform(sa_polygon, crs(model_data))
-
-# ---------------------------------------#
-# Plot all presences in the dataset   ####
-# ---------------------------------------#
-data_extent = ext(model_data)
-
-presences = model_data %>% 
-  dplyr::filter(present == 1)
-
-template_raster <- terra::rast(data_extent, resolution = 1000)  # 1000m = 1km
-crs(template_raster) <- st_crs(model_data)$wkt
-
-point_counts <- terra::rasterize(
-  vect(presences),          
-  template_raster,           
-  field = NULL,              
-  fun = "count",             
-  background = 0             
-) %>% 
-  terra::mask(vect(sa_polygon))
-
-length(which(values(point_counts) > 0)) 
-
-# only 36858 raster cells contain at least 1 presence record 
-
-ggplot() +
-  tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) +
-  scale_fill_gradientn(
-    colors = c("black", "yellow", "darkred"),
-    values = scales::rescale(c(0, 1, max(point_counts_df$count, na.rm = TRUE))),
-    breaks = c(0, 1, max(point_counts_df$count, na.rm = TRUE)),
-    na.value = "transparent"
-  ) +
-  geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) +
-  theme_minimal() +
-  coord_sf() +
-  labs(title = "Coordinate distribution and density")
-
-ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4)
-
-# ---------------------------------------#
-# Plot all presences in the dataset   ####
-# ---------------------------------------#
diff --git a/R/03_03_model_data_finalization.R b/R/03_03_model_data_finalization.R
index 8631c4d909f21b0800356ddbd51291aace4af9c2..e0addfa3f3b88ae7499371acfa6c4def0854e20f 100644
--- a/R/03_03_model_data_finalization.R
+++ b/R/03_03_model_data_finalization.R
@@ -31,49 +31,5 @@ model_data = bind_rows(model_data_core, model_data_background) %>%
   relocate(contains("fold"), .after = last_col()) %>% 
   mutate(species = as.factor(species))
 
-# Explore folds assignment
-cv_plot(spatial_folds)
-ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5)
-
-folds_summary = model_data %>% 
-  dplyr::filter(record_type == "core") %>% 
-  sf::st_drop_geometry() %>% 
-  dplyr::group_by(species, fold_global) %>% 
-  dplyr::summarise(
-    n_occ = length(which(present == 1)),
-    n_abs = length(which(present == 0))
-  ) 
-
-folds_uniform = folds_summary %>% 
-  dplyr::group_by(species, fold_global) %>% 
-  dplyr::summarise(
-    is_uniform = n_occ == 0 | n_abs == 0
-  ) %>% 
-  dplyr::group_by(species) %>% 
-  dplyr::summarise(
-    folds_total = n(),
-    folds_uniform = length(which(is_uniform)),
-    folds_mixed = folds_total - folds_uniform
-  ) %>% 
-  dplyr::ungroup()
-
-plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + 
-  geom_histogram(binwidth=0.5, fill = "orange") + 
-  labs(title="Number of folds per species", x="Count", y = "Number of species") +
-  theme_minimal()
-plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) + 
-  geom_histogram(binwidth=0.5, fill = "violet") + 
-  labs(title="Number of uniform folds per species", x="Count", y = "Number of species",
-       subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") +
-  theme_minimal()
-plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) + 
-  geom_histogram(binwidth=0.5, fill = "blue") + 
-  labs(title="Number of mixed folds per species", x="Count", y = "Number of species",
-       subtitle="Mixed folds both absences and presences and can be used in model evaluation.") +
-  theme_minimal()
-
-plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1)
-ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7)
-
 # Save Model data
-save(model_data, file = "data/r_objects/model_data.RData")
\ No newline at end of file
+save(model_data, file = "data/r_objects/model_data.RData")
diff --git a/R/03_04_dataset_exploration.R b/R/03_04_dataset_exploration.R
new file mode 100644
index 0000000000000000000000000000000000000000..53a627ac1929a6f054f614ac05707123e7aea6c8
--- /dev/null
+++ b/R/03_04_dataset_exploration.R
@@ -0,0 +1,95 @@
+library(tidyverse)
+library(sf)
+library(terra)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data.RData")
+load("data/r_objects/sa_polygon.RData")
+
+sa_polygon = sf::st_transform(sa_polygon, crs(model_data))
+
+# ---------------------------------------#
+# Plot all presences in the dataset   ####
+# ---------------------------------------#
+data_extent = ext(model_data)
+
+presences = model_data %>% 
+  dplyr::filter(present == 1)
+
+template_raster <- terra::rast(data_extent, resolution = 1000)  # 1000m = 1km
+crs(template_raster) <- st_crs(model_data)$wkt
+
+point_counts <- terra::rasterize(
+  vect(presences),          
+  template_raster,           
+  field = NULL,              
+  fun = "count",             
+  background = 0             
+) %>% 
+  terra::mask(vect(sa_polygon))
+
+length(which(values(point_counts) > 0)) # only 36858 raster cells contain at least 1 presence record 
+
+ggplot() +
+  tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) +
+  scale_fill_gradientn(
+    colors = c("black", "yellow", "darkred"),
+    values = scales::rescale(c(0, 1, max(point_counts_df$count, na.rm = TRUE))),
+    breaks = c(0, 1, max(point_counts_df$count, na.rm = TRUE)),
+    na.value = "transparent"
+  ) +
+  geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) +
+  theme_minimal() +
+  coord_sf() +
+  labs(title = "Coordinate distribution and density")
+
+ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4)
+
+# ---------------------------------------#
+# Explore folds assignment            ####
+# ---------------------------------------#
+cv_plot(spatial_folds)
+ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5)
+
+folds_summary = model_data %>% 
+  dplyr::filter(record_type == "core") %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::group_by(species, fold_global) %>% 
+  dplyr::summarise(
+    n_occ = length(which(present == 1)),
+    n_abs = length(which(present == 0))
+  ) 
+
+folds_uniform = folds_summary %>% 
+  dplyr::group_by(species, fold_global) %>% 
+  dplyr::summarise(
+    is_uniform = n_occ == 0 | n_abs == 0
+  ) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::summarise(
+    folds_total = n(),
+    folds_uniform = length(which(is_uniform)),
+    folds_mixed = folds_total - folds_uniform
+  ) %>% 
+  dplyr::ungroup()
+
+plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + 
+  geom_histogram(binwidth=0.5, fill = "orange") + 
+  labs(title="Number of folds per species", x="Count", y = "Number of species") +
+  theme_minimal()
+plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) + 
+  geom_histogram(binwidth=0.5, fill = "violet") + 
+  labs(title="Number of uniform folds per species", x="Count", y = "Number of species",
+       subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") +
+  theme_minimal()
+plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) + 
+  geom_histogram(binwidth=0.5, fill = "blue") + 
+  labs(title="Number of mixed folds per species", x="Count", y = "Number of species",
+       subtitle="Mixed folds both absences and presences and can be used in model evaluation.") +
+  theme_minimal()
+
+plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1)
+ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7)
+
+
diff --git a/R/04_01_modelling_ssdm.R b/R/04_01_modelling_ssdm.R
index 77c25ece6da4f63726d7ca1f31bae6a490cda626..a62cd3e0efecb194d26f89ceaab31c58caf033b4 100644
--- a/R/04_01_modelling_ssdm.R
+++ b/R/04_01_modelling_ssdm.R
@@ -193,7 +193,7 @@ data_split = model_data %>%
 for(pa_spec in data_split){
   ## Preparations ####
   species = pa_spec$species[1]
-  print(species)
+  print(as.character(species))
   
   # Create inner CV folds for model training
   cv_train = blockCV::cv_spatial(
@@ -231,7 +231,7 @@ for(pa_spec in data_split){
       method = "ranger",
       trControl = train_ctrl,
       tuneLength = 8,
-      weights = data_train$weight,
+      weights = pa_spec$weight,
       verbose = F
     )
     save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData"))
@@ -246,7 +246,7 @@ for(pa_spec in data_split){
       method = "gbm",
       trControl = train_ctrl,
       tuneLength = 8,
-      weights = data_train$weight,
+      weights = pa_spec$weight,
       verbose = F
     )
     save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData"))
@@ -259,7 +259,7 @@ for(pa_spec in data_split){
       y = pa_spec$present_fct,
       method = "gamSpline",
       tuneLength = 8,
-      weights = data_train$weight,
+      weights = pa_spec$weight,
       trControl = train_ctrl
     )
     save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData"))
@@ -277,7 +277,7 @@ for(pa_spec in data_split){
       burnin = 200L,
       early_stopping = 25,
       lr = 0.001,   
-      batchsize = min(ceiling(nrow(data_train)/10), 64),
+      batchsize = min(ceiling(nrow(pa_spec)/10), 64),
       dropout = 0.25,
       optimizer = config_optimizer("adam"),
       validation = 0.2,
diff --git a/R/04_04_modelling_msdm_rf.R b/R/04_04_modelling_msdm_rf.R
index 4544b7321a86f88f54d501a9ac49e284575ca770..1e03f70098f0c9086cdd9afc515fbb3578d2704c 100644
--- a/R/04_04_modelling_msdm_rf.R
+++ b/R/04_04_modelling_msdm_rf.R
@@ -67,7 +67,7 @@ rf_fit = caret::train(
   metric = "Accuracy",
   trControl = train_ctrl,
   tuneLength = 8,
-  weights = full_data$weight,
+  weights = model_data$weight,
   num.threads = 48
 )
 
diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd
index 8fb626b2f8adde35f089c3d5e1e3ae1826634066..38096cb26e46f1fdaf7f6917c449cf8905916bab 100644
--- a/R/05_01_performance_report.qmd
+++ b/R/05_01_performance_report.qmd
@@ -12,9 +12,9 @@ library(plotly)
 library(DT)
 
 load("../data/r_objects/model_data.RData")
-#load("../data/r_objects/ssdm_performance.RData")
+load("../data/r_objects/ssdm_performance.RData")
 load("../data/r_objects/msdm_embed_performance.RData")
-#load("../data/r_objects/msdm_onehot_performance.RData")
+load("../data/r_objects/msdm_onehot_performance.RData")
 load("../data/r_objects/msdm_rf_performance.RData")
 
 load("../data/r_objects/range_maps.RData")
@@ -55,11 +55,32 @@ loess_fit = function(x, y, span = 0.75){
   )
 }
 
+# Determine valid folds for evaluation
+# A fold is valid if there are at least n presences in the training and validation data
+obs_fold = model_data %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::filter(record_type == "core", present == 1) %>% 
+  dplyr::group_by(species, fold_global) %>% 
+  dplyr::summarise(obs_fold = n()) 
+
+obs_total = model_data %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::filter(record_type == "core", present == 1) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::summarise(obs_total = n()) 
+
+folds_valid = obs_fold %>% 
+  dplyr::left_join(obs_total) %>% 
+  dplyr::mutate(obs_train = obs_total-obs_fold) %>% 
+  dplyr::filter(obs_train > 5 & obs_fold > 5) %>% 
+  dplyr::select(species, fold_global)
+
 # Performance table
-performance = msdm_embed_performance %>% 
-  #bind_rows(msdm_embed_performance) %>% 
-  #bind_rows(msdm_onehot_performance) %>% 
+performance = ssdm_performance %>% 
+  bind_rows(msdm_embed_performance) %>% 
+  bind_rows(msdm_onehot_performance) %>% 
   bind_rows(msdm_rf_performance) %>% 
+  semi_join(folds_valid, by = c("species", "fold_global")) %>% 
   dplyr::group_by(species, model, metric) %>% 
   dplyr::summarise(value = mean(value, na.rm = T)) %>% 
   dplyr::mutate(
@@ -199,7 +220,7 @@ df_join = model_data %>%
   dplyr::summarise(n_obs = n())
 
 performance = performance %>% 
-  dplyr::left_join(df_join, by = "species") 
+  dplyr::inner_join(df_join, by = "species") 
 
 # Range size
 df_join = range_maps %>% 
@@ -377,7 +398,7 @@ Range size was calculated based on polygon layers from the IUCN Red List of Thre
 
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "auc")
-plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 
@@ -385,7 +406,7 @@ bslib::card(plot, full_screen = T)
 
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "f1")
-plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 
@@ -393,7 +414,7 @@ bslib::card(plot, full_screen = T)
 
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "kappa")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 
@@ -401,7 +422,7 @@ bslib::card(plot, full_screen = T)
 
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "accuracy")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 
@@ -409,7 +430,7 @@ bslib::card(plot, full_screen = T)
 
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "precision")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 
@@ -417,7 +438,7 @@ bslib::card(plot, full_screen = T)
 
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "recall")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 :::
@@ -430,7 +451,7 @@ bslib::card(plot, full_screen = T)
 ```{r echo = FALSE}
 df_plot = dplyr::filter(performance, metric == "auc") %>% 
   add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km]")
+plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
 bslib::card(plot, full_screen = T)
 ```
 
diff --git a/R/05_01_performance_report_rf.qmd b/R/05_01_performance_report_rf.qmd
deleted file mode 100644
index 534a5f8924df3f579029fbc76be7c72bba0db14b..0000000000000000000000000000000000000000
--- a/R/05_01_performance_report_rf.qmd
+++ /dev/null
@@ -1,692 +0,0 @@
----
-title: "SDM Performance report"
-format: html
-editor: visual
-engine: knitr
----
-
-```{r init, echo = FALSE, include = FALSE}
-library(tidyverse)
-library(sf)
-library(plotly)
-library(DT)
-
-load("../data/r_objects/model_data.RData")
-load("../data/r_objects/range_maps.RData")
-load("../data/r_objects/range_maps_gridded.RData")
-load("../data/r_objects/occs_final.RData")
-load("../data/r_objects/functional_groups.RData")
-
-sf::sf_use_s2(use_s2 = FALSE)
-
-# Load performance of different RF versions
-load("../data/r_objects/deprecated/msdm_rf_no_species_random_abs_performance.RData")
-load("../data/r_objects/deprecated/msdm_rf_no_species_performance.RData")
-load("../data/r_objects/msdm_rf_performance.RData")
-
-msdm_rf_no_species_performance$model = "rf_noSpec_oldVars_informedAbs"
-msdm_rf_no_species_performance$metric = tolower(msdm_rf_no_species_performance$metric)
-msdm_rf_no_species_random_abs_performance$model = "rf_noSpec_oldVars_randomAbs"
-msdm_rf_no_species_random_abs_performance$metric = tolower(msdm_rf_no_species_random_abs_performance$metric)
-msdm_rf_performance$model = "rf_noSpec_newVars_randomAbs"
-```
-
-```{r globals, echo = FALSE, cache = TRUE, include = FALSE}
-# Regression functions
-asym_fit = function(x, y){
-  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
-  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
-  data.frame(
-    x = new_x,
-    fit = predict(nls_fit, newdata = data.frame(x = new_x))
-  )
-}
-
-glm_fit = function(x, y, family = "binomial"){
-  glm_fit = suppressWarnings(glm(y~x, family = family))
-  new_x = seq(min(x), max(x), length.out = 100)
-  data.frame(
-    x = new_x,
-    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
-  )
-}
-
-loess_fit = function(x, y, span = 0.75){
-  df = data.frame(x = x, y = y)
-  loess_fit = loess(y ~ x, data = df, span = span)
-  new_x = seq(min(x), max(x), length.out = 100)
-  data.frame(
-    x = new_x,
-    fit = predict(loess_fit, newdata = data.frame(x = new_x))
-  )
-}
-
-# Performance table
-performance = msdm_rf_performance %>% 
-  bind_rows(msdm_rf_no_species_performance) %>% 
-  bind_rows(msdm_rf_no_species_random_abs_performance) %>% 
-  dplyr::group_by(species, model, metric) %>% 
-  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
-  dplyr::mutate(
-    value = case_when(
-      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
-      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
-      .default = value
-    )
-  ) %>% 
-  ungroup()
-
-plot_performance = function(df_plot, y_var, y_label, x_var, x_label, x_log = T, reg_func = loess_fit) {
-  df_plot = df_plot %>% 
-    dplyr::rename(
-      x_var = !!x_var,
-      y_var = !!y_var
-    )
-  
-  # Calculate regression lines for each model and metric combination
-  suppressWarnings({
-    regression_lines = df_plot %>%
-      group_by(model) %>%
-      group_modify( ~ reg_func(.x[["x_var"]], .x[["y_var"]]))
-  })
-  
-  # Create base plot
-  plot <- plot_ly() %>%
-    layout(
-      title = paste0("Model Performance vs. ", x_label),
-      xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")),
-      yaxis = list(title = y_label),
-      legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-      margin = list(r = 150), # Add right margin to accommodate legend
-      hovermode = 'closest'
-    )
-  
-  # Points
-  for (model_name in unique(df_plot$model)) {
-    plot = plot %>%
-      add_markers(
-        data = filter(df_plot, model == model_name),
-        x = ~ x_var,
-        y = ~ y_var,
-        color = model_name, # Set color to match legendgroup
-        legendgroup = model_name,
-        opacity = 0.6,
-        name = ~ model,
-        hoverinfo = 'text',
-        text = ~ paste(
-          "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(y_var, 3)
-        )
-      )
-  }
-  
-  # Add regression lines
-  for (model_name in unique(df_plot$model)) {
-    reg_data = dplyr::filter(regression_lines, model == model_name)
-    plot = plot %>%
-      add_lines(
-        data = reg_data,
-        x = ~ x,
-        y = ~ fit,
-        color = model_name, # Set color to match legendgroup
-        legendgroup = model_name,
-        name = paste(model_name, '(fit)'),
-        showlegend = FALSE
-      )
-  }
-  
-  return(plot)
-}
-
-add_partial_residuals = function(df, focus_var, control_vars, family = quasibinomial) {
-  model_formula <- as.formula(paste("value ~", paste(c(focus_var, control_vars), collapse = " + ")))
-  model = glm(model_formula, data = df, family = family)
-  
-  if (focus_var %in% names(model$coefficients)) {
-    df[[paste0(focus_var, "_partial")]] = residuals(model, type = "response") + model$coefficients[focus_var] * df[[focus_var]]
-  }
-  
-  return(df)
-}
-
-```
-
-## Summary
-
-This document contrasts the performance of different variations of random forest models. 
-
-## Analysis
-
-### Quantify drivers of model performance
-
-```{r prepare_model_df, echo = FALSE, include = FALSE}
-# Number of records
-df_join = model_data %>% 
-  sf::st_drop_geometry() %>% 
-  dplyr::filter(present == 1) %>% 
-  dplyr::group_by(species) %>% 
-  dplyr::summarise(n_obs = n())
-
-performance = performance %>% 
-  dplyr::left_join(df_join, by = "species") 
-
-# Range size
-df_join = range_maps %>% 
-  sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs") %>% 
-  dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>%  # range in sqkm
-  sf::st_drop_geometry()
-
-performance = performance %>% 
-  inner_join(df_join, by = c("species" = "name_matched"))
-
-# Range coverage
-range_maps_gridded = range_maps_gridded %>%
-  sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs")
-
-df_cells_total = range_maps_gridded %>%
-  dplyr::rename("species" = name_matched) %>% 
-  group_by(species) %>%
-  summarise(cells_total = n()) %>% 
-  st_drop_geometry()
-
-df_cells_occ = range_maps_gridded %>% 
-  st_join(occs_final, join = st_intersects) %>%
-  filter(name_matched == species) %>%         # Filter only intersections of the same species
-  group_by(species) %>%
-  summarise(cells_occupied = n_distinct(geometry)) %>%
-  st_drop_geometry()
-
-df_join = df_cells_total %>% 
-  dplyr::inner_join(df_cells_occ, by = "species") %>% 
-  dplyr::mutate(range_cov = cells_occupied / cells_total) %>% 
-  dplyr::select(species, range_cov)
-
-performance = performance %>% 
-  inner_join(df_join, by = "species")
-
-# Range coverage bias
-df_occs_total = occs_final %>% 
-  st_drop_geometry() %>% 
-  group_by(species) %>% 
-  summarise(occs_total = n())
-
-df_join = df_occs_total %>% 
-  dplyr::inner_join(df_cells_total, by = "species") %>% 
-  dplyr::inner_join(df_cells_occ, by = "species") %>% 
-  dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) %>%
-  dplyr::select(species, range_bias)
-
-performance = performance %>% 
-  inner_join(df_join, by = "species")
-```
-
-### Number of records
-
-::: panel-tabset
-#### Marginal Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc")
-plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1")
-plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-:::
-
-#### Partial Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "n_obs", x_label = "Number of records")
-bslib::card(plot, full_screen = T)
-```
-:::
-:::
-
-### Range size
-
-Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).
-
-::: panel-tabset
-#### Marginal Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc")
-plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1")
-plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-:::
-
-#### Partial Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km]")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_size", x_label = "Range size [km²]")
-bslib::card(plot, full_screen = T)
-```
-:::
-:::
-
-### Range coverage
-
-Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.
-
-$$
-RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
-$$
-
-::: panel-tabset
-#### Marginal Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc")
-plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1")
-plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-:::
-
-#### Partial Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_cov", x_label = "Range coverage")
-bslib::card(plot, full_screen = T)
-```
-:::
-:::
-
-### Range coverage bias
-
-Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
-
-$$
-RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)}
-$$
-
-Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
-
-::: panel-tabset
-#### Marginal Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc")
-plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1")
-plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall")
-plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-:::
-
-#### Partial Effects
-
-::: panel-tabset
-##### AUC
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "auc") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### F1
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "f1") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Cohen's kappa
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "kappa") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian)
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Accuracy
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "accuracy") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Precision
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "precision") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-
-##### Recall
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(performance, metric == "recall") %>% 
-  add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"))
-plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_bias", x_label = "Range coverage bias")
-bslib::card(plot, full_screen = T)
-```
-:::
-:::
diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R
index 4d347c4c53455fa3734dc3076e163aa68d0d2115..8f9bc5efb6235c1057daaa660651571863965df2 100644
--- a/R/05_02_publication_analysis.R
+++ b/R/05_02_publication_analysis.R
@@ -242,10 +242,10 @@ raster_data = terra::rast(raster_filepaths) %>%
   terra::crop(sf::st_bbox(sa_polygon)) %>% 
   terra::project(sf::st_crs(model_data)$input)
 
-specs = c("Histiotus velatus", "Hydrochoerus hydrochaeris", "Euryoryzomys legatus", "Eumops trumbulli")
+specs = c("Aotus lemurinus")
 for(spec in specs){
   pdf(file = paste0("plots/range_predictions/", spec, ".pdf"), width = 12)
-  plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm_onehot", "msdm_embed", "msdm_rf"))
+  plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf"))
   lapply(plots, print)
   dev.off()
 }
diff --git a/R/_publish.yml b/R/_publish.yml
index 5604c706877eb12f339edad6480b654f76b4eab2..1da1e8787c9b0c9b22c24ac552ae323c00102459 100644
--- a/R/_publish.yml
+++ b/R/_publish.yml
@@ -2,8 +2,6 @@
   quarto-pub:
     - id: 77b30762-b473-447d-be18-d4bbd6261fbf
       url: 'https://chrkoenig.quarto.pub/sdm-performance-report'
-    - id: 7777d450-82af-4364-b4c1-b52d7139ade0
-      url: 'https://chrkoenig.quarto.pub/rf-nospec-performance-report'
 - source: 05_01_performance_report_rf.qmd
   quarto-pub:
     - id: 98e776a5-fb0b-41f9-8733-446bf7ffb272
diff --git a/README.md b/README.md
index aaaa50cd194bb8d2807ab511b9ca8845a4352836..a26b7475873243a602c58480864a3895bfd9033c 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,62 @@
-# Symobio-modeling
+# Codebase Documentation
 
-Code and data accompanying SSDM vs MSDM model comparison
\ No newline at end of file
+This repository implements a species distribution modeling comparison study for about 600 South American mammal species. Specifically, the 
+
+## Project Structure
+
+- **`R/`**: Contains all the R scripts organized by workflow steps.
+- **`Symobio_modeling.Rproj`**: RStudio project file for easy navigation.
+- **`README.md`**: High-level overview of the project.
+- **`renv/`**: Manages package dependencies for reproducibility.
+- **`renv.lock`**: Lockfile for `renv` to ensure consistent package versions.
+
+## Workflow Overview
+
+The workflow is divided into several stages, each represented by scripts in the `R/` directory. Below is a summary of the key steps:
+
+### 1. Preparation of Geographic Data
+- **`01_01_range_map_preparation.R`**: Processes IUCN mammal range maps for South America, converting them to a standardized raster format with consistent projection for downstream analysis.
+- **`01_02_raster_preparation.R`**: Prepares environmental predictor rasters (e.g., climate, elevation, land cover) by cropping to South America extent, resampling to consistent resolution, and performing any necessary transformations.
+
+### 2. Preparation of complementary species-level data
+
+
+- **`02_01_functional_group_assignment.R`**: Assigns mammal species to functional groups based on diet, locomotion, and body size characteristics, creating categorical variables for modeling.
+- **`02_02_functional_traits_preparation.R`**: Cleans and standardizes continuous trait data (body mass, diet breadth, etc.) for all study species, handling missing values through imputation where necessary.
+- **`02_03_phylo_preparation.R`**: Extracts phylogenetic information for target mammal species, computes phylogenetic distance matrices, and prepares the data for inclusion in models.
+
+### 3. Preparation of Presence/Absence Data
+- **`03_01_presence_preparation.R`**: Processes occurrence records from GBIF and other sources, applies spatial filtering to reduce sampling bias, and aligns taxonomic nomenclature.
+- **`03_02_absence_preparation.R`**: Generates pseudo-absence points using a stratified random approach, with constraints based on environmental conditions and range map boundaries.
+- **`03_03_dataset_exploration.R`**: Produces descriptive statistics and visualizations of presence/absence data, environmental variables, and species coverage to assess data quality.
+- **`03_04_model_data_finalization.R`**: Merges all prepared datasets (occurrences, absences, predictors) into final modeling datasets, splits data into training/testing sets, and applies any necessary scaling or transformations.
+
+### 4. Modeling
+- **`04_01_modelling_ssdm.R`**: Implements traditional single-species distribution modeling (SSDM) approaches with selected algorithms (e.g., MaxEnt, random forests), including hyperparameter tuning.
+- **`04_02_modelling_msdm_embed.R`**: Develops multi-species distribution models using neural network approaches that embed species identities into a latent space, capturing inter-species relationships.
+- **`04_03_modelling_msdm_onehot.R`**: Implements multi-species distribution models using one-hot encoding for species identities, enabling joint prediction across all species simultaneously.
+- **`04_04_modelling_msdm_rf.R`**: Implements random forest-based multi-species distribution modeling, incorporating species identity as a predictor variable alongside environmental variables.
+
+### 5. Analysis
+- **`05_01_performance_report.qmd`**: Generates comprehensive reports on model performance metrics (AUC, TSS, etc.) for all modeling approaches, with visualizations comparing performance across species and methods.
+- **`05_02_publication_analysis.qmd`**: Conducts advanced statistical analyses of model results, creates publication-quality figures, and summarizes findings for manuscript preparation.
+
+## Miscellaneous
+- **`utils.R`**: Contains utility functions used across multiple scripts, including data processing helpers, custom evaluation metrics, and visualization functions. 
+
+### Miscellaneous
+- **`utils.R`**: 
+
+## Getting Started
+
+1. Clone the repository and open the `Symobio_modeling.Rproj` file in RStudio.
+2. Restore the project environment using `renv`:
+   ```r
+   renv::restore()
+   ```
+3. Run the scripts in the R/ directory sequentially. Some scripts, especially for model fitting, may run a long time and benefit from powerful hardware.
+
+## Additional Notes
+- Ensure that all required input data (e.g., range maps, raster files) is available in the expected directories. 
+- Outputs from each script are typically saved to disk and used as inputs for subsequent scripts.
+- Refer to the README.md file for any additional project-specific instructions.
diff --git a/occurrences.png b/occurrences.png
deleted file mode 100644
index 0a8e43d1943357ed699740edbce1198cfddb6b33..0000000000000000000000000000000000000000
Binary files a/occurrences.png and /dev/null differ