From 3ed7e5fdd5debfaf5df9be169d344a2afd9660b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Fri, 21 Mar 2025 12:24:00 +0100 Subject: [PATCH] Finished file restructuring + improved documentation --- R/01_01_range_preparation.R | 13 +++++++++---- R/01_02_traits_preparation.R | 18 ++++++++++++------ R/01_03_phylo_preparation.R | 6 +++--- R/01_04_raster_preparation.R | 4 ++-- R/02_01_presence_data_preparation.R | 10 +++++----- ...ation.R => 02_03_model_data_finalization.R} | 0 ...ploration.R => 02_04_dataset_exploration.R} | 7 ++++++- ...modelling_ssdm.R => 03_01_modelling_ssdm.R} | 2 +- ...dm_embed.R => 03_02_modelling_msdm_embed.R} | 0 ..._onehot.R => 03_03_modelling_msdm_onehot.R} | 0 ...ing_msdm_rf.R => 03_04_modelling_msdm_rf.R} | 0 ...report.qmd => 04_01_performance_report.qmd} | 0 ...analysis.R => 04_02_publication_analysis.R} | 0 13 files changed, 38 insertions(+), 22 deletions(-) rename R/{03_03_model_data_finalization.R => 02_03_model_data_finalization.R} (100%) rename R/{03_04_dataset_exploration.R => 02_04_dataset_exploration.R} (95%) rename R/{04_01_modelling_ssdm.R => 03_01_modelling_ssdm.R} (98%) rename R/{04_02_modelling_msdm_embed.R => 03_02_modelling_msdm_embed.R} (100%) rename R/{04_03_modelling_msdm_onehot.R => 03_03_modelling_msdm_onehot.R} (100%) rename R/{04_04_modelling_msdm_rf.R => 03_04_modelling_msdm_rf.R} (100%) rename R/{05_01_performance_report.qmd => 04_01_performance_report.qmd} (100%) rename R/{05_02_publication_analysis.R => 04_02_publication_analysis.R} (100%) diff --git a/R/01_01_range_preparation.R b/R/01_01_range_preparation.R index 4f6a412..487d756 100644 --- a/R/01_01_range_preparation.R +++ b/R/01_01_range_preparation.R @@ -7,7 +7,7 @@ library(Rfast) sf::sf_use_s2(use_s2 = FALSE) # --------------------------------------# -# Process range maps #### +# Process range maps #### # --------------------------------------# # Load IUCN range maps range_maps = st_read( @@ -52,7 +52,7 @@ range_maps = range_maps %>% save(range_maps, file = "data/r_objects/range_maps.RData") # -------------------------------------------# -# Process gridded range maps #### +# Process gridded range maps #### # -------------------------------------------# range_maps_gridded = rnaturalearth::ne_countries() %>% dplyr::filter(continent == "South America") %>% @@ -65,10 +65,11 @@ range_maps_gridded = rnaturalearth::ne_countries() %>% save(range_maps_gridded, file = "data/r_objects/range_maps_gridded.RData") # ----------------------------------------------# -# Calculate range dissimilarity #### +# Calculate range dissimilarity #### # ----------------------------------------------# load("data/r_objects/range_maps_gridded.RData") +# Get grid ids range_maps_gridded_id = range_maps_gridded %>% dplyr::group_by(geometry) %>% dplyr::mutate(geom_id = cur_group_id()) %>% @@ -79,6 +80,7 @@ geometries_unique = range_maps_gridded_id %>% group_by(geom_id) %>% slice_head(n = 1) +# Calculate pairwise distances between grid cells geom_dist = sf::st_distance(geometries_unique, geometries_unique) %>% # Takes ~ 10 mins as.matrix() @@ -86,6 +88,7 @@ range_maps_split = range_maps_gridded_id %>% group_by(name_matched) %>% group_split() +# Calculate mean minimum distance between grid cells of species A and B mean_minimum_distances = lapply(range_maps_split, function(df1){ # Takes ~ 30 mins df_dist = lapply(range_maps_split, function(df2){ dists_subset = geom_dist[df1$geom_id, df2$geom_id, drop = F] @@ -103,11 +106,13 @@ mean_minimum_distances = lapply(range_maps_split, function(df1){ # Takes ~ 30 m return(df_dist) }) %>% bind_rows() +# Reshape into distance matrix range_dist = mean_minimum_distances %>% pivot_wider(names_from = species2, values_from = dist) %>% column_to_rownames("species1") %>% as.matrix() -range_dist = range_dist / max(range_dist) # Scale to (0,1) +# Scale to (0,1) +range_dist = range_dist / max(range_dist) save(range_dist, file = "data/r_objects/range_dist.RData") diff --git a/R/01_02_traits_preparation.R b/R/01_02_traits_preparation.R index 0e48083..c594b42 100644 --- a/R/01_02_traits_preparation.R +++ b/R/01_02_traits_preparation.R @@ -44,7 +44,7 @@ save(functional_groups, file = "data/r_objects/functional_groups.RData") # ---------------------------------------------------# # Process and merge functional traits #### # ---------------------------------------------------# -# Match names +# Match taxonomic names traits = traitdata::elton_mammals names_unique = unique(traits$scientificNameStd) @@ -84,21 +84,26 @@ traits_proc = traits_matched %>% match_epithet = stringr::str_detect(species, Species)) %>% dplyr::group_by(species) %>% dplyr::group_modify(~ { + # Only one record with accepted species name if (nrow(.x) == 1) { return (.x) } + # If multiple records for accepted species name x_mod = .x while (nrow(x_mod) > 1){ + # 1. Remove records with mismatched genus name, retry if(any(isFALSE(x_mod$match_genus))){ x_mod = dplyr::filter(x_mod, isTRUE(match_genus)) next } + # 1. Remove records with mismatched epithet name, retry if(any(isFALSE(x_mod$match_epithet))){ x_mod = dplyr::filter(x_mod, isTRUE(match_epithet)) next } + # If there are still multiple records with matching genus and epithet, just take the first record x_mod = x_mod[1,] } @@ -108,7 +113,7 @@ traits_proc = traits_matched %>% return(.x[1,]) } }) %>% - dplyr::select(c("species", diet_cols, strata_cols, activity_cols, bodymass_cols)) + dplyr::select(all_of(c("species", diet_cols, strata_cols, activity_cols, bodymass_cols))) save(traits_proc, file = "data/r_objects/traits_proc.RData") @@ -122,10 +127,11 @@ traits_target = dplyr::filter(traits_proc, species %in% target_species) traits_target$`ForStrat.Value` = as.numeric(factor(traits_target$`ForStrat.Value`, levels = c("G", "S", "Ar", "A"))) traits_target$`BodyMass.Value` = scale(log(traits_target$`BodyMass.Value`)) -diet_dist = vegan::vegdist(traits_target[,diet_cols], "bray") -foraging_dist = dist(traits_target[,strata_cols]) -activity_dist = vegan::vegdist(traits_target[,activity_cols], "bray") -bodymass_dist = dist(traits_target[,bodymass_cols]) +# Calculate distances for trait categories separately +diet_dist = vegan::vegdist(traits_target[,diet_cols], "bray") # Bray-Curtis +foraging_dist = dist(traits_target[,strata_cols]) # Euclidean +activity_dist = vegan::vegdist(traits_target[,activity_cols], "bray") # Bray-Curtis +bodymass_dist = dist(traits_target[,bodymass_cols]) # Euclidean func_dist = (diet_dist + foraging_dist/max(foraging_dist) + activity_dist + bodymass_dist/max(bodymass_dist)) / 4 names(func_dist) = traits_target$species diff --git a/R/01_03_phylo_preparation.R b/R/01_03_phylo_preparation.R index 4ba9a85..806e4f6 100644 --- a/R/01_03_phylo_preparation.R +++ b/R/01_03_phylo_preparation.R @@ -7,7 +7,7 @@ forest = read.nexus("data/phylogenies/Complete_phylogeny.nex") load("data/r_objects/range_maps_names_matched.RData") # ---------------------------------------------------# -# Match taxonomic names in phylogeny #### +# Match taxonomic names in phylogeny #### # ---------------------------------------------------# phylo_names_unique = unique(forest$UNTITLED$tip.label) @@ -31,9 +31,8 @@ phylo_names_matched = lapply(phylo_names_unique, function(name){ save(phylo_names_matched, file = "data/r_objects/phylo_names_matched.RData") # ---------------------------------------------------# -# Calculate phylo distances #### +# Calculate phylo distances #### # ---------------------------------------------------# -# Calculate pairwise phylogenetic distances across a random sample 50 of forests # Trim forest to target species phylo_names_target = dplyr::filter(phylo_names_matched, name_matched %in% range_maps_names_matched$name_matched) @@ -45,6 +44,7 @@ forest_pruned = lapply(forest, function(x) { return(x_pruned) }) +# Calculate pairwise phylogenetic distances across a random sample 50 of forests class(forest_pruned) <- "multiPhylo" indices = sample(length(forest_pruned), 50) dists = lapply(indices, function(i){ diff --git a/R/01_04_raster_preparation.R b/R/01_04_raster_preparation.R index aa9e210..fdee825 100644 --- a/R/01_04_raster_preparation.R +++ b/R/01_04_raster_preparation.R @@ -1,13 +1,13 @@ library(tidyverse) library(terra) library(rnaturalearth) -library(furrr) +library(sf) # Define Study extent #### sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% dplyr::filter(continent == "South America") %>% sf::st_union() %>% - st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13) + sf::st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13) save(sa_polygon, file = "data/r_objects/sa_polygon.RData") diff --git a/R/02_01_presence_data_preparation.R b/R/02_01_presence_data_preparation.R index 7a2697e..48237cd 100644 --- a/R/02_01_presence_data_preparation.R +++ b/R/02_01_presence_data_preparation.R @@ -14,11 +14,11 @@ library(DBI) source("R/utils.R") -con = db_connect() # Connection to Symobio -sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry +con = db_connect() # Connection to Symobio DB +# sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry # ---------------------------------------------------------------------------# -# Prepare Geodata #### +# Load Geodata #### # ---------------------------------------------------------------------------# load("data/r_objects/sa_polygon.RData") @@ -34,7 +34,7 @@ load("data/r_objects/range_maps.RData") target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) # Query species from Symobio -occs = tbl(con, "species_occurrences") %>% +occs = tbl(con, "species_occurrences") %>% # This line connects to the "species_occurrences" table in SymobioDB dplyr::filter(species %in% target_species) %>% dplyr::select(-year) %>% dplyr::distinct() %>% @@ -69,7 +69,7 @@ occs_final = occs_flagged %>% dplyr::select(species, longitude, latitude) # Extract environmental data -proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" +proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" # Equal Area projection for South America, created with https://projectionwizard.org/ raster_data = terra::project(raster_data, proj_string) occs_final = st_transform(occs_final, proj_string) diff --git a/R/03_03_model_data_finalization.R b/R/02_03_model_data_finalization.R similarity index 100% rename from R/03_03_model_data_finalization.R rename to R/02_03_model_data_finalization.R diff --git a/R/03_04_dataset_exploration.R b/R/02_04_dataset_exploration.R similarity index 95% rename from R/03_04_dataset_exploration.R rename to R/02_04_dataset_exploration.R index 53a627a..01d3632 100644 --- a/R/03_04_dataset_exploration.R +++ b/R/02_04_dataset_exploration.R @@ -12,6 +12,7 @@ sa_polygon = sf::st_transform(sa_polygon, crs(model_data)) # ---------------------------------------# # Plot all presences in the dataset #### # ---------------------------------------# +# Prepare data data_extent = ext(model_data) presences = model_data %>% @@ -29,8 +30,10 @@ point_counts <- terra::rasterize( ) %>% terra::mask(vect(sa_polygon)) -length(which(values(point_counts) > 0)) # only 36858 raster cells contain at least 1 presence record +length(which(values(point_counts) > 0)) +# --> only 36858 raster cells contain at least 1 presence record +# Create plot ggplot() + tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) + scale_fill_gradientn( @@ -49,6 +52,7 @@ ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4) # ---------------------------------------# # Explore folds assignment #### # ---------------------------------------# +# Prepare data cv_plot(spatial_folds) ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5) @@ -74,6 +78,7 @@ folds_uniform = folds_summary %>% ) %>% dplyr::ungroup() +# Create plot plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + geom_histogram(binwidth=0.5, fill = "orange") + labs(title="Number of folds per species", x="Count", y = "Number of species") + diff --git a/R/04_01_modelling_ssdm.R b/R/03_01_modelling_ssdm.R similarity index 98% rename from R/04_01_modelling_ssdm.R rename to R/03_01_modelling_ssdm.R index a62cd3e..d21855d 100644 --- a/R/04_01_modelling_ssdm.R +++ b/R/03_01_modelling_ssdm.R @@ -24,7 +24,7 @@ data_split = model_data %>% # ----------------------------------------------------------------------# # Set up parallel processing #### # ----------------------------------------------------------------------# -num_cores <- 2 +num_cores <- 2 # Some packages use internal parallelization, so be careful with increasing this param cl <- makeCluster(num_cores) registerDoParallel(cl) diff --git a/R/04_02_modelling_msdm_embed.R b/R/03_02_modelling_msdm_embed.R similarity index 100% rename from R/04_02_modelling_msdm_embed.R rename to R/03_02_modelling_msdm_embed.R diff --git a/R/04_03_modelling_msdm_onehot.R b/R/03_03_modelling_msdm_onehot.R similarity index 100% rename from R/04_03_modelling_msdm_onehot.R rename to R/03_03_modelling_msdm_onehot.R diff --git a/R/04_04_modelling_msdm_rf.R b/R/03_04_modelling_msdm_rf.R similarity index 100% rename from R/04_04_modelling_msdm_rf.R rename to R/03_04_modelling_msdm_rf.R diff --git a/R/05_01_performance_report.qmd b/R/04_01_performance_report.qmd similarity index 100% rename from R/05_01_performance_report.qmd rename to R/04_01_performance_report.qmd diff --git a/R/05_02_publication_analysis.R b/R/04_02_publication_analysis.R similarity index 100% rename from R/05_02_publication_analysis.R rename to R/04_02_publication_analysis.R -- GitLab