From 3ed7e5fdd5debfaf5df9be169d344a2afd9660b8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Fri, 21 Mar 2025 12:24:00 +0100
Subject: [PATCH] Finished file restructuring + improved documentation

---
 R/01_01_range_preparation.R                    | 13 +++++++++----
 R/01_02_traits_preparation.R                   | 18 ++++++++++++------
 R/01_03_phylo_preparation.R                    |  6 +++---
 R/01_04_raster_preparation.R                   |  4 ++--
 R/02_01_presence_data_preparation.R            | 10 +++++-----
 ...ation.R => 02_03_model_data_finalization.R} |  0
 ...ploration.R => 02_04_dataset_exploration.R} |  7 ++++++-
 ...modelling_ssdm.R => 03_01_modelling_ssdm.R} |  2 +-
 ...dm_embed.R => 03_02_modelling_msdm_embed.R} |  0
 ..._onehot.R => 03_03_modelling_msdm_onehot.R} |  0
 ...ing_msdm_rf.R => 03_04_modelling_msdm_rf.R} |  0
 ...report.qmd => 04_01_performance_report.qmd} |  0
 ...analysis.R => 04_02_publication_analysis.R} |  0
 13 files changed, 38 insertions(+), 22 deletions(-)
 rename R/{03_03_model_data_finalization.R => 02_03_model_data_finalization.R} (100%)
 rename R/{03_04_dataset_exploration.R => 02_04_dataset_exploration.R} (95%)
 rename R/{04_01_modelling_ssdm.R => 03_01_modelling_ssdm.R} (98%)
 rename R/{04_02_modelling_msdm_embed.R => 03_02_modelling_msdm_embed.R} (100%)
 rename R/{04_03_modelling_msdm_onehot.R => 03_03_modelling_msdm_onehot.R} (100%)
 rename R/{04_04_modelling_msdm_rf.R => 03_04_modelling_msdm_rf.R} (100%)
 rename R/{05_01_performance_report.qmd => 04_01_performance_report.qmd} (100%)
 rename R/{05_02_publication_analysis.R => 04_02_publication_analysis.R} (100%)

diff --git a/R/01_01_range_preparation.R b/R/01_01_range_preparation.R
index 4f6a412..487d756 100644
--- a/R/01_01_range_preparation.R
+++ b/R/01_01_range_preparation.R
@@ -7,7 +7,7 @@ library(Rfast)
 sf::sf_use_s2(use_s2 = FALSE)
 
 # --------------------------------------#
-#       Process range maps           ####
+# Process range maps                 ####
 # --------------------------------------#
 # Load IUCN range maps
 range_maps = st_read(
@@ -52,7 +52,7 @@ range_maps = range_maps %>%
 save(range_maps, file = "data/r_objects/range_maps.RData")
 
 # -------------------------------------------#
-#       Process gridded range maps        ####
+# Process gridded range maps              ####
 # -------------------------------------------#
 range_maps_gridded = rnaturalearth::ne_countries() %>% 
   dplyr::filter(continent == "South America") %>% 
@@ -65,10 +65,11 @@ range_maps_gridded = rnaturalearth::ne_countries() %>%
 save(range_maps_gridded, file = "data/r_objects/range_maps_gridded.RData")
 
 # ----------------------------------------------#
-#       Calculate range dissimilarity        ####
+# Calculate range dissimilarity              ####
 # ----------------------------------------------#
 load("data/r_objects/range_maps_gridded.RData")
 
+# Get grid ids
 range_maps_gridded_id = range_maps_gridded %>% 
   dplyr::group_by(geometry) %>%
   dplyr::mutate(geom_id = cur_group_id()) %>% 
@@ -79,6 +80,7 @@ geometries_unique = range_maps_gridded_id %>%
   group_by(geom_id) %>% 
   slice_head(n = 1)
 
+# Calculate pairwise distances between grid cells
 geom_dist = sf::st_distance(geometries_unique, geometries_unique)  %>%  # Takes ~ 10 mins
   as.matrix()
 
@@ -86,6 +88,7 @@ range_maps_split = range_maps_gridded_id %>%
   group_by(name_matched) %>% 
   group_split()
 
+# Calculate mean minimum distance between grid cells of species A and B
 mean_minimum_distances = lapply(range_maps_split, function(df1){  # Takes ~ 30 mins
   df_dist = lapply(range_maps_split, function(df2){
     dists_subset = geom_dist[df1$geom_id, df2$geom_id, drop = F]
@@ -103,11 +106,13 @@ mean_minimum_distances = lapply(range_maps_split, function(df1){  # Takes ~ 30 m
   return(df_dist)
 }) %>% bind_rows()
 
+# Reshape into distance matrix
 range_dist = mean_minimum_distances %>% 
   pivot_wider(names_from = species2, values_from = dist) %>% 
   column_to_rownames("species1") %>% 
   as.matrix()
 
-range_dist = range_dist / max(range_dist) # Scale to (0,1)
+# Scale to (0,1)
+range_dist = range_dist / max(range_dist) 
 
 save(range_dist, file = "data/r_objects/range_dist.RData")
diff --git a/R/01_02_traits_preparation.R b/R/01_02_traits_preparation.R
index 0e48083..c594b42 100644
--- a/R/01_02_traits_preparation.R
+++ b/R/01_02_traits_preparation.R
@@ -44,7 +44,7 @@ save(functional_groups, file = "data/r_objects/functional_groups.RData")
 # ---------------------------------------------------#
 #        Process and merge functional traits      ####
 # ---------------------------------------------------#
-# Match names
+# Match taxonomic names
 traits = traitdata::elton_mammals
 names_unique = unique(traits$scientificNameStd)
 
@@ -84,21 +84,26 @@ traits_proc = traits_matched %>%
          match_epithet = stringr::str_detect(species, Species)) %>% 
   dplyr::group_by(species) %>% 
   dplyr::group_modify(~ {
+    # Only one record with accepted species name
     if (nrow(.x) == 1) { 
       return (.x)
     }
+    # If multiple records for accepted species name
     x_mod = .x
     while (nrow(x_mod) > 1){
+      # 1. Remove records with mismatched genus name, retry
       if(any(isFALSE(x_mod$match_genus))){
         x_mod = dplyr::filter(x_mod, isTRUE(match_genus))
         next
       }
       
+      # 1. Remove records with mismatched epithet name, retry
       if(any(isFALSE(x_mod$match_epithet))){
         x_mod = dplyr::filter(x_mod, isTRUE(match_epithet))
         next
       }
       
+      # If there are still multiple records with matching genus and epithet, just take the first record
       x_mod = x_mod[1,]
     }
     
@@ -108,7 +113,7 @@ traits_proc = traits_matched %>%
       return(.x[1,])
     }
   }) %>% 
-  dplyr::select(c("species", diet_cols, strata_cols, activity_cols, bodymass_cols))
+  dplyr::select(all_of(c("species", diet_cols, strata_cols, activity_cols, bodymass_cols)))
 
 save(traits_proc, file = "data/r_objects/traits_proc.RData")
 
@@ -122,10 +127,11 @@ traits_target = dplyr::filter(traits_proc, species %in% target_species)
 traits_target$`ForStrat.Value` = as.numeric(factor(traits_target$`ForStrat.Value`, levels = c("G", "S", "Ar", "A")))
 traits_target$`BodyMass.Value` = scale(log(traits_target$`BodyMass.Value`))
 
-diet_dist = vegan::vegdist(traits_target[,diet_cols], "bray")
-foraging_dist = dist(traits_target[,strata_cols])
-activity_dist = vegan::vegdist(traits_target[,activity_cols], "bray")
-bodymass_dist = dist(traits_target[,bodymass_cols])
+# Calculate distances for trait categories separately
+diet_dist = vegan::vegdist(traits_target[,diet_cols], "bray")   # Bray-Curtis
+foraging_dist = dist(traits_target[,strata_cols])               # Euclidean
+activity_dist = vegan::vegdist(traits_target[,activity_cols], "bray")   # Bray-Curtis
+bodymass_dist = dist(traits_target[,bodymass_cols])             # Euclidean
 
 func_dist = (diet_dist + foraging_dist/max(foraging_dist) + activity_dist + bodymass_dist/max(bodymass_dist)) / 4
 names(func_dist) = traits_target$species
diff --git a/R/01_03_phylo_preparation.R b/R/01_03_phylo_preparation.R
index 4ba9a85..806e4f6 100644
--- a/R/01_03_phylo_preparation.R
+++ b/R/01_03_phylo_preparation.R
@@ -7,7 +7,7 @@ forest = read.nexus("data/phylogenies/Complete_phylogeny.nex")
 load("data/r_objects/range_maps_names_matched.RData")
 
 # ---------------------------------------------------#
-#        Match taxonomic names in phylogeny       ####
+# Match taxonomic names in phylogeny              ####
 # ---------------------------------------------------#
 phylo_names_unique = unique(forest$UNTITLED$tip.label)
 
@@ -31,9 +31,8 @@ phylo_names_matched  = lapply(phylo_names_unique, function(name){
 save(phylo_names_matched, file = "data/r_objects/phylo_names_matched.RData")
 
 # ---------------------------------------------------#
-#         Calculate phylo distances               ####
+# Calculate phylo distances                      ####
 # ---------------------------------------------------#
-# Calculate pairwise phylogenetic distances across a random sample 50 of forests
 # Trim forest to target species
 phylo_names_target = dplyr::filter(phylo_names_matched, name_matched %in% range_maps_names_matched$name_matched)
 
@@ -45,6 +44,7 @@ forest_pruned = lapply(forest, function(x) {
   return(x_pruned)
 })
 
+# Calculate pairwise phylogenetic distances across a random sample 50 of forests
 class(forest_pruned) <- "multiPhylo"
 indices = sample(length(forest_pruned), 50)
 dists = lapply(indices, function(i){
diff --git a/R/01_04_raster_preparation.R b/R/01_04_raster_preparation.R
index aa9e210..fdee825 100644
--- a/R/01_04_raster_preparation.R
+++ b/R/01_04_raster_preparation.R
@@ -1,13 +1,13 @@
 library(tidyverse)
 library(terra)
 library(rnaturalearth)
-library(furrr)
+library(sf)
 
 # Define Study extent ####
 sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% 
   dplyr::filter(continent == "South America") %>% 
   sf::st_union() %>% 
-  st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13)
+  sf::st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13)
 
 save(sa_polygon, file = "data/r_objects/sa_polygon.RData")
 
diff --git a/R/02_01_presence_data_preparation.R b/R/02_01_presence_data_preparation.R
index 7a2697e..48237cd 100644
--- a/R/02_01_presence_data_preparation.R
+++ b/R/02_01_presence_data_preparation.R
@@ -14,11 +14,11 @@ library(DBI)
 
 source("R/utils.R")
 
-con = db_connect() # Connection to Symobio
-sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
+con = db_connect() # Connection to Symobio DB
+# sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
 
 # ---------------------------------------------------------------------------#
-# Prepare Geodata                                                         ####
+# Load Geodata                                                            ####
 # ---------------------------------------------------------------------------#
 load("data/r_objects/sa_polygon.RData")
 
@@ -34,7 +34,7 @@ load("data/r_objects/range_maps.RData")
 target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
 
 # Query species from Symobio
-occs = tbl(con, "species_occurrences") %>% 
+occs = tbl(con, "species_occurrences") %>% # This line connects to the "species_occurrences" table in SymobioDB
   dplyr::filter(species %in% target_species) %>% 
   dplyr::select(-year) %>% 
   dplyr::distinct() %>% 
@@ -69,7 +69,7 @@ occs_final = occs_flagged %>%
   dplyr::select(species, longitude, latitude)
 
 # Extract environmental data
-proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
+proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" # Equal Area projection for South America, created with https://projectionwizard.org/
 
 raster_data = terra::project(raster_data, proj_string)
 occs_final = st_transform(occs_final, proj_string)
diff --git a/R/03_03_model_data_finalization.R b/R/02_03_model_data_finalization.R
similarity index 100%
rename from R/03_03_model_data_finalization.R
rename to R/02_03_model_data_finalization.R
diff --git a/R/03_04_dataset_exploration.R b/R/02_04_dataset_exploration.R
similarity index 95%
rename from R/03_04_dataset_exploration.R
rename to R/02_04_dataset_exploration.R
index 53a627a..01d3632 100644
--- a/R/03_04_dataset_exploration.R
+++ b/R/02_04_dataset_exploration.R
@@ -12,6 +12,7 @@ sa_polygon = sf::st_transform(sa_polygon, crs(model_data))
 # ---------------------------------------#
 # Plot all presences in the dataset   ####
 # ---------------------------------------#
+# Prepare data
 data_extent = ext(model_data)
 
 presences = model_data %>% 
@@ -29,8 +30,10 @@ point_counts <- terra::rasterize(
 ) %>% 
   terra::mask(vect(sa_polygon))
 
-length(which(values(point_counts) > 0)) # only 36858 raster cells contain at least 1 presence record 
+length(which(values(point_counts) > 0)) 
+# --> only 36858 raster cells contain at least 1 presence record 
 
+# Create plot
 ggplot() +
   tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) +
   scale_fill_gradientn(
@@ -49,6 +52,7 @@ ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4)
 # ---------------------------------------#
 # Explore folds assignment            ####
 # ---------------------------------------#
+# Prepare data
 cv_plot(spatial_folds)
 ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5)
 
@@ -74,6 +78,7 @@ folds_uniform = folds_summary %>%
   ) %>% 
   dplyr::ungroup()
 
+# Create plot
 plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + 
   geom_histogram(binwidth=0.5, fill = "orange") + 
   labs(title="Number of folds per species", x="Count", y = "Number of species") +
diff --git a/R/04_01_modelling_ssdm.R b/R/03_01_modelling_ssdm.R
similarity index 98%
rename from R/04_01_modelling_ssdm.R
rename to R/03_01_modelling_ssdm.R
index a62cd3e..d21855d 100644
--- a/R/04_01_modelling_ssdm.R
+++ b/R/03_01_modelling_ssdm.R
@@ -24,7 +24,7 @@ data_split = model_data %>%
 # ----------------------------------------------------------------------#
 # Set up parallel processing                                         ####
 # ----------------------------------------------------------------------#
-num_cores <- 2
+num_cores <- 2 # Some packages use internal parallelization, so be careful with increasing this param
 cl <- makeCluster(num_cores)
 registerDoParallel(cl)
 
diff --git a/R/04_02_modelling_msdm_embed.R b/R/03_02_modelling_msdm_embed.R
similarity index 100%
rename from R/04_02_modelling_msdm_embed.R
rename to R/03_02_modelling_msdm_embed.R
diff --git a/R/04_03_modelling_msdm_onehot.R b/R/03_03_modelling_msdm_onehot.R
similarity index 100%
rename from R/04_03_modelling_msdm_onehot.R
rename to R/03_03_modelling_msdm_onehot.R
diff --git a/R/04_04_modelling_msdm_rf.R b/R/03_04_modelling_msdm_rf.R
similarity index 100%
rename from R/04_04_modelling_msdm_rf.R
rename to R/03_04_modelling_msdm_rf.R
diff --git a/R/05_01_performance_report.qmd b/R/04_01_performance_report.qmd
similarity index 100%
rename from R/05_01_performance_report.qmd
rename to R/04_01_performance_report.qmd
diff --git a/R/05_02_publication_analysis.R b/R/04_02_publication_analysis.R
similarity index 100%
rename from R/05_02_publication_analysis.R
rename to R/04_02_publication_analysis.R
-- 
GitLab