Finished file restructuring + improved documentation

3ed7e5fd · ye87zine · 9f7c6efb · 3ed7e5fd · 3ed7e5fd · 3ed7e5fd
Commit 3ed7e5fd authored 4 months ago by ye87zine
--- a/R/01_01_range_preparation.R
+++ b/R/01_01_range_preparation.R
@@ -7,7 +7,7 @@ library(Rfast)
 sf::sf_use_s2(use_s2 = FALSE)
 # --------------------------------------#
-#       Process range maps           ####
+# Process range maps                 ####
 # --------------------------------------#
 # Load IUCN range maps
 range_maps = st_read(
@@ -52,7 +52,7 @@ range_maps = range_maps %>%
 save(range_maps, file = "data/r_objects/range_maps.RData")
 # -------------------------------------------#
-#       Process gridded range maps        ####
+# Process gridded range maps              ####
 # -------------------------------------------#
 range_maps_gridded = rnaturalearth::ne_countries() %>% 
  dplyr::filter(continent == "South America") %>% 
@@ -65,10 +65,11 @@ range_maps_gridded = rnaturalearth::ne_countries() %>%
 save(range_maps_gridded, file = "data/r_objects/range_maps_gridded.RData")
 # ----------------------------------------------#
-#       Calculate range dissimilarity        ####
+# Calculate range dissimilarity              ####
 # ----------------------------------------------#
 load("data/r_objects/range_maps_gridded.RData")
+# Get grid ids
 range_maps_gridded_id = range_maps_gridded %>% 
  dplyr::group_by(geometry) %>%
  dplyr::mutate(geom_id = cur_group_id()) %>% 
@@ -79,6 +80,7 @@ geometries_unique = range_maps_gridded_id %>%
  group_by(geom_id) %>% 
  slice_head(n = 1)
+# Calculate pairwise distances between grid cells
 geom_dist = sf::st_distance(geometries_unique, geometries_unique)  %>%  # Takes ~ 10 mins
  as.matrix()
@@ -86,6 +88,7 @@ range_maps_split = range_maps_gridded_id %>%
  group_by(name_matched) %>% 
  group_split()
+# Calculate mean minimum distance between grid cells of species A and B
 mean_minimum_distances = lapply(range_maps_split, function(df1){  # Takes ~ 30 mins
  df_dist = lapply(range_maps_split, function(df2){
    dists_subset = geom_dist[df1$geom_id, df2$geom_id, drop = F]
@@ -103,11 +106,13 @@ mean_minimum_distances = lapply(range_maps_split, function(df1){  # Takes ~ 30 m
  return(df_dist)
 }) %>% bind_rows()
+# Reshape into distance matrix
 range_dist = mean_minimum_distances %>% 
  pivot_wider(names_from = species2, values_from = dist) %>% 
  column_to_rownames("species1") %>% 
  as.matrix()
-range_dist = range_dist / max(range_dist) # Scale to (0,1)
+# Scale to (0,1)
+range_dist = range_dist / max(range_dist) 
 save(range_dist, file = "data/r_objects/range_dist.RData")
--- a/R/01_02_traits_preparation.R
+++ b/R/01_02_traits_preparation.R
@@ -44,7 +44,7 @@ save(functional_groups, file = "data/r_objects/functional_groups.RData")
 # ---------------------------------------------------#
 #        Process and merge functional traits      ####
 # ---------------------------------------------------#
-# Match names
+# Match taxonomic names
 traits = traitdata::elton_mammals
 names_unique = unique(traits$scientificNameStd)
@@ -84,21 +84,26 @@ traits_proc = traits_matched %>%
         match_epithet = stringr::str_detect(species, Species)) %>% 
  dplyr::group_by(species) %>% 
  dplyr::group_modify(~ {
+    # Only one record with accepted species name
    if (nrow(.x) == 1) { 
      return (.x)
    }
+    # If multiple records for accepted species name
    x_mod = .x
    while (nrow(x_mod) > 1){
+      # 1. Remove records with mismatched genus name, retry
      if(any(isFALSE(x_mod$match_genus))){
        x_mod = dplyr::filter(x_mod, isTRUE(match_genus))
        next
      }
+      # 1. Remove records with mismatched epithet name, retry
      if(any(isFALSE(x_mod$match_epithet))){
        x_mod = dplyr::filter(x_mod, isTRUE(match_epithet))
        next
      }
+      # If there are still multiple records with matching genus and epithet, just take the first record
      x_mod = x_mod[1,]
    }
@@ -108,7 +113,7 @@ traits_proc = traits_matched %>%
      return(.x[1,])
    }
  }) %>% 
-  dplyr::select(c("species", diet_cols, strata_cols, activity_cols, bodymass_cols))
+  dplyr::select(all_of(c("species", diet_cols, strata_cols, activity_cols, bodymass_cols)))
 save(traits_proc, file = "data/r_objects/traits_proc.RData")
@@ -122,10 +127,11 @@ traits_target = dplyr::filter(traits_proc, species %in% target_species)
 traits_target$`ForStrat.Value` = as.numeric(factor(traits_target$`ForStrat.Value`, levels = c("G", "S", "Ar", "A")))
 traits_target$`BodyMass.Value` = scale(log(traits_target$`BodyMass.Value`))
-diet_dist = vegan::vegdist(traits_target[,diet_cols], "bray")
+# Calculate distances for trait categories separately
-foraging_dist = dist(traits_target[,strata_cols])
+diet_dist = vegan::vegdist(traits_target[,diet_cols], "bray")   # Bray-Curtis
-activity_dist = vegan::vegdist(traits_target[,activity_cols], "bray")
+foraging_dist = dist(traits_target[,strata_cols])               # Euclidean
-bodymass_dist = dist(traits_target[,bodymass_cols])
+activity_dist = vegan::vegdist(traits_target[,activity_cols], "bray")   # Bray-Curtis
+bodymass_dist = dist(traits_target[,bodymass_cols])             # Euclidean
 func_dist = (diet_dist + foraging_dist/max(foraging_dist) + activity_dist + bodymass_dist/max(bodymass_dist)) / 4
 names(func_dist) = traits_target$species

--- a/R/01_03_phylo_preparation.R
+++ b/R/01_03_phylo_preparation.R
@@ -7,7 +7,7 @@ forest = read.nexus("data/phylogenies/Complete_phylogeny.nex")
 load("data/r_objects/range_maps_names_matched.RData")
 # ---------------------------------------------------#
-#        Match taxonomic names in phylogeny       ####
+# Match taxonomic names in phylogeny              ####
 # ---------------------------------------------------#
 phylo_names_unique = unique(forest$UNTITLED$tip.label)
@@ -31,9 +31,8 @@ phylo_names_matched  = lapply(phylo_names_unique, function(name){
 save(phylo_names_matched, file = "data/r_objects/phylo_names_matched.RData")
 # ---------------------------------------------------#
-#         Calculate phylo distances               ####
+# Calculate phylo distances                      ####
 # ---------------------------------------------------#
-# Calculate pairwise phylogenetic distances across a random sample 50 of forests
 # Trim forest to target species
 phylo_names_target = dplyr::filter(phylo_names_matched, name_matched %in% range_maps_names_matched$name_matched)
@@ -45,6 +44,7 @@ forest_pruned = lapply(forest, function(x) {
  return(x_pruned)
 })
+# Calculate pairwise phylogenetic distances across a random sample 50 of forests
 class(forest_pruned) <- "multiPhylo"
 indices = sample(length(forest_pruned), 50)
 dists = lapply(indices, function(i){

--- a/R/01_04_raster_preparation.R
+++ b/R/01_04_raster_preparation.R
 library(tidyverse)
 library(terra)
 library(rnaturalearth)
-library(furrr)
+library(sf)
 # Define Study extent ####
 sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% 
  dplyr::filter(continent == "South America") %>% 
  sf::st_union() %>% 
-  st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13)
+  sf::st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13)
 save(sa_polygon, file = "data/r_objects/sa_polygon.RData")

--- a/R/02_01_presence_data_preparation.R
+++ b/R/02_01_presence_data_preparation.R
@@ -14,11 +14,11 @@ library(DBI)
 source("R/utils.R")
-con = db_connect() # Connection to Symobio
+con = db_connect() # Connection to Symobio DB
-sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
+# sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
 # ---------------------------------------------------------------------------#
-# Prepare Geodata                                                         ####
+# Load Geodata                                                            ####
 # ---------------------------------------------------------------------------#
 load("data/r_objects/sa_polygon.RData")
@@ -34,7 +34,7 @@ load("data/r_objects/range_maps.RData")
 target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
 # Query species from Symobio
-occs = tbl(con, "species_occurrences") %>% 
+occs = tbl(con, "species_occurrences") %>% # This line connects to the "species_occurrences" table in SymobioDB
  dplyr::filter(species %in% target_species) %>% 
  dplyr::select(-year) %>% 
  dplyr::distinct() %>% 
@@ -69,7 +69,7 @@ occs_final = occs_flagged %>%
  dplyr::select(species, longitude, latitude)
 # Extract environmental data
-proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
+proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" # Equal Area projection for South America, created with https://projectionwizard.org/
 raster_data = terra::project(raster_data, proj_string)
 occs_final = st_transform(occs_final, proj_string)

--- a/R/03_03_model_data_finalization.R
+++ b/R/03_03_model_data_finalization.R
--- a/R/03_04_dataset_exploration.R
+++ b/R/03_04_dataset_exploration.R
@@ -12,6 +12,7 @@ sa_polygon = sf::st_transform(sa_polygon, crs(model_data))
 # ---------------------------------------#
 # Plot all presences in the dataset   ####
 # ---------------------------------------#
+# Prepare data
 data_extent = ext(model_data)
 presences = model_data %>% 
@@ -29,8 +30,10 @@ point_counts <- terra::rasterize(
 ) %>% 
  terra::mask(vect(sa_polygon))
-length(which(values(point_counts) > 0)) # only 36858 raster cells contain at least 1 presence record 
+length(which(values(point_counts) > 0)) 
+# --> only 36858 raster cells contain at least 1 presence record 
+# Create plot
 ggplot() +
  tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) +
  scale_fill_gradientn(
@@ -49,6 +52,7 @@ ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4)
 # ---------------------------------------#
 # Explore folds assignment            ####
 # ---------------------------------------#
+# Prepare data
 cv_plot(spatial_folds)
 ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5)
@@ -74,6 +78,7 @@ folds_uniform = folds_summary %>%
  ) %>% 
  dplyr::ungroup()
+# Create plot
 plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + 
  geom_histogram(binwidth=0.5, fill = "orange") + 
  labs(title="Number of folds per species", x="Count", y = "Number of species") +

--- a/R/04_01_modelling_ssdm.R
+++ b/R/04_01_modelling_ssdm.R
@@ -24,7 +24,7 @@ data_split = model_data %>%
 # ----------------------------------------------------------------------#
 # Set up parallel processing                                         ####
 # ----------------------------------------------------------------------#
-num_cores <- 2
+num_cores <- 2 # Some packages use internal parallelization, so be careful with increasing this param
 cl <- makeCluster(num_cores)
 registerDoParallel(cl)

--- a/R/04_02_modelling_msdm_embed.R
+++ b/R/04_02_modelling_msdm_embed.R
--- a/R/04_03_modelling_msdm_onehot.R
+++ b/R/04_03_modelling_msdm_onehot.R
--- a/R/04_04_modelling_msdm_rf.R
+++ b/R/04_04_modelling_msdm_rf.R
--- a/R/05_01_performance_report.qmd
+++ b/R/05_01_performance_report.qmd
--- a/R/05_02_publication_analysis.R
+++ b/R/05_02_publication_analysis.R