diff --git a/.gitignore b/.gitignore index 3acf079a9890636921e00602124682ed4a556f26..4d2ad7cd89b5eac9b061262753fe098518070806 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,4 @@ renv/cache/ data/ plots/ R/*/ -R/*.html \ No newline at end of file +*.html \ No newline at end of file diff --git a/R/01_01_range_map_preparation.R b/R/01_01_range_preparation.R similarity index 99% rename from R/01_01_range_map_preparation.R rename to R/01_01_range_preparation.R index eb33fdc1b726ab35bf7fa5bd97ec638740f30600..4f6a412243451e6b38f5ecc8358aa95083cc4164 100644 --- a/R/01_01_range_map_preparation.R +++ b/R/01_01_range_preparation.R @@ -9,7 +9,7 @@ sf::sf_use_s2(use_s2 = FALSE) # --------------------------------------# # Process range maps #### # --------------------------------------# -# Load range maps +# Load IUCN range maps range_maps = st_read( "~/share/groups/mas_data/Saved_Data_Dropbox_Business/Datensätze/Range Maps/IUCN_range_maps_mammals_version2016/TERRESTRIAL_MAMMALS.shp", geometry_column = "geometry", diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R deleted file mode 100644 index e7876e935c8b77377e003bb17490e704abc9246e..0000000000000000000000000000000000000000 --- a/R/01_02_raster_preparation.R +++ /dev/null @@ -1,36 +0,0 @@ -library(tidyverse) -library(terra) -library(rnaturalearth) -library(furrr) - -# Define Study extent #### -sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% - dplyr::filter(continent == "South America") %>% - sf::st_union() %>% - st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13) - -save(sa_polygon, file = "data/r_objects/sa_polygon.RData") - -# CHELSA bioclim target variables #### -chelsa_urls = paste0("https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio", 1:19,"_1981-2010_V.2.1.tif") - -chelsa = terra::rast(chelsa_urls) %>% - crop(sa_polygon) -set.names(chelsa, paste0("bio", 1:19)) -terra::writeRaster(chelsa, filename = file.path("data", "geospatial", "raster", basename(chelsa_urls)), overwrite = T) - -# Tree canopy cover #### -igfc = list.files("I:/mas_data/00_data/processed/iGFC/", pattern = "canopyDensity_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% - terra::rast() %>% - terra::resample(chelsa) %>% - app(mean, na.rm = TRUE) -terra::set.names(igfc, "igfc") -terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "mean_canopyDensity_1992-2010.tif"), overwrite = T) - -# Terrain #### -roughness = list.files("C:/Users/ye87zine/Downloads", pattern = "roughness", full.names = T) %>% - terra::rast() %>% - crop(study_extent) %>% - terra::resample(chelsa) -terra::set.names(roughness, "roughness") -terra::writeRaster(roughness, filename = file.path("data", "geospatial", "raster", "terrain_roughness.tif"), overwrite = T) diff --git a/R/02_02_functional_traits_preparation.R b/R/01_02_traits_preparation.R similarity index 64% rename from R/02_02_functional_traits_preparation.R rename to R/01_02_traits_preparation.R index dfec715e7946b3ed97f82c21763d1798622a11c4..0e480830dfd26d97657bcc3926131fece0d5616b 100644 --- a/R/02_02_functional_traits_preparation.R +++ b/R/01_02_traits_preparation.R @@ -1,6 +1,49 @@ library("tidyverse") library("traitdata") +library("Symobio") +library("vegan") + +load("data/r_objects/range_maps.RData") + +# -------------------------------------------# +# Assign functional groups #### +# -------------------------------------------# +# Get taxonomic information for target species +names_unique = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) + +species_matched = lapply(names_unique, function(name){ + match_result = Symobio::gbif_match_name(name = name) + if(match_result$status != "ACCEPTED"){ # Search for accepted name + match_result = gbif_match_name(usageKey = match_result$acceptedUsageKey) + } + match_result$name_orig = name + return(match_result) +}) %>% + bind_rows() + +# Assign functional groups +functional_groups = species_matched %>% + mutate( + functional_group = case_when( + order %in% c("Carnivora", "Artiodactyla", "Cingulata", "Perissodactyla") ~ 1, + order %in% c("Rodentia", "Didelphimorphia", "Soricomorpha", "Paucituberculata", "Lagomorpha") ~ 2, + order %in% c("Primates", "Pilosa") ~ 3, + order %in% c("Chiroptera") ~ 4 + ), + functional_group = factor(functional_group, labels = c("large ground-dwelling", "small ground-dwelling", "arboreal", "flying")) + ) %>% + dplyr::select( + name_orig, + name_matched = species, + functional_group + ) %>% + distinct() +save(functional_groups, file = "data/r_objects/functional_groups.RData") + +# ---------------------------------------------------# +# Process and merge functional traits #### +# ---------------------------------------------------# # Match names traits = traitdata::elton_mammals names_unique = unique(traits$scientificNameStd) @@ -69,12 +112,9 @@ traits_proc = traits_matched %>% save(traits_proc, file = "data/r_objects/traits_proc.RData") -# Calculate Distances -library("vegan") - -load("data/r_objects/range_maps.RData") -load("data/r_objects/traits_proc.RData") - +# ---------------------------------------------------# +# Calculate functional dissimilarity #### +# ---------------------------------------------------# target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) traits_target = dplyr::filter(traits_proc, species %in% target_species) diff --git a/R/02_03_phylo_preparation.R b/R/01_03_phylo_preparation.R similarity index 84% rename from R/02_03_phylo_preparation.R rename to R/01_03_phylo_preparation.R index a72c8332e080f96fa3c45aab2a7e955fa5be0f81..4ba9a85f34c5d325b4383234363b0bc06b339804 100644 --- a/R/02_03_phylo_preparation.R +++ b/R/01_03_phylo_preparation.R @@ -6,7 +6,9 @@ library(Symobio) forest = read.nexus("data/phylogenies/Complete_phylogeny.nex") load("data/r_objects/range_maps_names_matched.RData") -# Get taxonomic information for target species +# ---------------------------------------------------# +# Match taxonomic names in phylogeny #### +# ---------------------------------------------------# phylo_names_unique = unique(forest$UNTITLED$tip.label) phylo_names_matched = lapply(phylo_names_unique, function(name){ @@ -28,6 +30,10 @@ phylo_names_matched = lapply(phylo_names_unique, function(name){ save(phylo_names_matched, file = "data/r_objects/phylo_names_matched.RData") +# ---------------------------------------------------# +# Calculate phylo distances #### +# ---------------------------------------------------# +# Calculate pairwise phylogenetic distances across a random sample 50 of forests # Trim forest to target species phylo_names_target = dplyr::filter(phylo_names_matched, name_matched %in% range_maps_names_matched$name_matched) @@ -40,8 +46,6 @@ forest_pruned = lapply(forest, function(x) { }) class(forest_pruned) <- "multiPhylo" - -# Calculate pairwise phylogenetic distances across a random sample 50 of forests indices = sample(length(forest_pruned), 50) dists = lapply(indices, function(i){ cophenetic.phylo(forest_pruned[[i]]) diff --git a/R/01_04_raster_preparation.R b/R/01_04_raster_preparation.R new file mode 100644 index 0000000000000000000000000000000000000000..aa9e21078be7373c2b25a678aecb1ae298f19c3b --- /dev/null +++ b/R/01_04_raster_preparation.R @@ -0,0 +1,20 @@ +library(tidyverse) +library(terra) +library(rnaturalearth) +library(furrr) + +# Define Study extent #### +sa_polygon = rnaturalearth::ne_countries(scale = 10, returnclass = "sf") %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() %>% + st_crop(xmin = -82, ymin = -56, xmax = -34, ymax = 13) + +save(sa_polygon, file = "data/r_objects/sa_polygon.RData") + +# CHELSA bioclim target variables #### +chelsa_urls = paste0("https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio", 1:19,"_1981-2010_V.2.1.tif") + +chelsa = terra::rast(chelsa_urls) %>% + crop(sa_polygon) +set.names(chelsa, paste0("bio", 1:19)) +terra::writeRaster(chelsa, filename = file.path("data", "geospatial", "raster", basename(chelsa_urls)), overwrite = T) \ No newline at end of file diff --git a/R/02_01_functional_group_assignment.R b/R/02_01_functional_group_assignment.R deleted file mode 100644 index ee2a06f6390e6ea98475cf654867a026a385e644..0000000000000000000000000000000000000000 --- a/R/02_01_functional_group_assignment.R +++ /dev/null @@ -1,34 +0,0 @@ -load("data/r_objects/range_maps.RData") - -# Get taxonomic information for target species -names_unique = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) - -species_matched = lapply(names_unique, function(name){ - match_result = Symobio::gbif_match_name(name = name) - if(match_result$status != "ACCEPTED"){ - match_result = gbif_match_name(usageKey = match_result$acceptedUsageKey) - } - match_result$name_orig = name - return(match_result) -}) %>% - bind_rows() - -# Assign functional groups -functional_groups = species_matched %>% - mutate( - functional_group = case_when( - order %in% c("Carnivora", "Artiodactyla", "Cingulata", "Perissodactyla") ~ 1, - order %in% c("Rodentia", "Didelphimorphia", "Soricomorpha", "Paucituberculata", "Lagomorpha") ~ 2, - order %in% c("Primates", "Pilosa") ~ 3, - order %in% c("Chiroptera") ~ 4 - ), - functional_group = factor(functional_group, labels = c("large ground-dwelling", "small ground-dwelling", "arboreal", "flying")) - ) %>% - dplyr::select( - name_orig, - name_matched = species, - functional_group - ) %>% - distinct() - -save(functional_groups, file = "data/r_objects/functional_groups.RData") diff --git a/R/03_01_presence_preparation.R b/R/02_01_presence_data_preparation.R similarity index 100% rename from R/03_01_presence_preparation.R rename to R/02_01_presence_data_preparation.R diff --git a/R/03_02_absence_preparation.R b/R/02_02_absence_data_preparation.R similarity index 100% rename from R/03_02_absence_preparation.R rename to R/02_02_absence_data_preparation.R diff --git a/R/03_03_dataset_exploration.R b/R/03_03_dataset_exploration.R deleted file mode 100644 index ad331734226f8f071007abe5e739d5fd83df1d4b..0000000000000000000000000000000000000000 --- a/R/03_03_dataset_exploration.R +++ /dev/null @@ -1,53 +0,0 @@ -library(tidyverse) -library(sf) -library(terra) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") -load("data/r_objects/sa_polygon.RData") - -sa_polygon = sf::st_transform(sa_polygon, crs(model_data)) - -# ---------------------------------------# -# Plot all presences in the dataset #### -# ---------------------------------------# -data_extent = ext(model_data) - -presences = model_data %>% - dplyr::filter(present == 1) - -template_raster <- terra::rast(data_extent, resolution = 1000) # 1000m = 1km -crs(template_raster) <- st_crs(model_data)$wkt - -point_counts <- terra::rasterize( - vect(presences), - template_raster, - field = NULL, - fun = "count", - background = 0 -) %>% - terra::mask(vect(sa_polygon)) - -length(which(values(point_counts) > 0)) - -# only 36858 raster cells contain at least 1 presence record - -ggplot() + - tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) + - scale_fill_gradientn( - colors = c("black", "yellow", "darkred"), - values = scales::rescale(c(0, 1, max(point_counts_df$count, na.rm = TRUE))), - breaks = c(0, 1, max(point_counts_df$count, na.rm = TRUE)), - na.value = "transparent" - ) + - geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) + - theme_minimal() + - coord_sf() + - labs(title = "Coordinate distribution and density") - -ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4) - -# ---------------------------------------# -# Plot all presences in the dataset #### -# ---------------------------------------# diff --git a/R/03_03_model_data_finalization.R b/R/03_03_model_data_finalization.R index 8631c4d909f21b0800356ddbd51291aace4af9c2..e0addfa3f3b88ae7499371acfa6c4def0854e20f 100644 --- a/R/03_03_model_data_finalization.R +++ b/R/03_03_model_data_finalization.R @@ -31,49 +31,5 @@ model_data = bind_rows(model_data_core, model_data_background) %>% relocate(contains("fold"), .after = last_col()) %>% mutate(species = as.factor(species)) -# Explore folds assignment -cv_plot(spatial_folds) -ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5) - -folds_summary = model_data %>% - dplyr::filter(record_type == "core") %>% - sf::st_drop_geometry() %>% - dplyr::group_by(species, fold_global) %>% - dplyr::summarise( - n_occ = length(which(present == 1)), - n_abs = length(which(present == 0)) - ) - -folds_uniform = folds_summary %>% - dplyr::group_by(species, fold_global) %>% - dplyr::summarise( - is_uniform = n_occ == 0 | n_abs == 0 - ) %>% - dplyr::group_by(species) %>% - dplyr::summarise( - folds_total = n(), - folds_uniform = length(which(is_uniform)), - folds_mixed = folds_total - folds_uniform - ) %>% - dplyr::ungroup() - -plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + - geom_histogram(binwidth=0.5, fill = "orange") + - labs(title="Number of folds per species", x="Count", y = "Number of species") + - theme_minimal() -plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) + - geom_histogram(binwidth=0.5, fill = "violet") + - labs(title="Number of uniform folds per species", x="Count", y = "Number of species", - subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") + - theme_minimal() -plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) + - geom_histogram(binwidth=0.5, fill = "blue") + - labs(title="Number of mixed folds per species", x="Count", y = "Number of species", - subtitle="Mixed folds both absences and presences and can be used in model evaluation.") + - theme_minimal() - -plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1) -ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7) - # Save Model data -save(model_data, file = "data/r_objects/model_data.RData") \ No newline at end of file +save(model_data, file = "data/r_objects/model_data.RData") diff --git a/R/03_04_dataset_exploration.R b/R/03_04_dataset_exploration.R new file mode 100644 index 0000000000000000000000000000000000000000..53a627ac1929a6f054f614ac05707123e7aea6c8 --- /dev/null +++ b/R/03_04_dataset_exploration.R @@ -0,0 +1,95 @@ +library(tidyverse) +library(sf) +library(terra) + +source("R/utils.R") + +load("data/r_objects/model_data.RData") +load("data/r_objects/sa_polygon.RData") + +sa_polygon = sf::st_transform(sa_polygon, crs(model_data)) + +# ---------------------------------------# +# Plot all presences in the dataset #### +# ---------------------------------------# +data_extent = ext(model_data) + +presences = model_data %>% + dplyr::filter(present == 1) + +template_raster <- terra::rast(data_extent, resolution = 1000) # 1000m = 1km +crs(template_raster) <- st_crs(model_data)$wkt + +point_counts <- terra::rasterize( + vect(presences), + template_raster, + field = NULL, + fun = "count", + background = 0 +) %>% + terra::mask(vect(sa_polygon)) + +length(which(values(point_counts) > 0)) # only 36858 raster cells contain at least 1 presence record + +ggplot() + + tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) + + scale_fill_gradientn( + colors = c("black", "yellow", "darkred"), + values = scales::rescale(c(0, 1, max(point_counts_df$count, na.rm = TRUE))), + breaks = c(0, 1, max(point_counts_df$count, na.rm = TRUE)), + na.value = "transparent" + ) + + geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) + + theme_minimal() + + coord_sf() + + labs(title = "Coordinate distribution and density") + +ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4) + +# ---------------------------------------# +# Explore folds assignment #### +# ---------------------------------------# +cv_plot(spatial_folds) +ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5) + +folds_summary = model_data %>% + dplyr::filter(record_type == "core") %>% + sf::st_drop_geometry() %>% + dplyr::group_by(species, fold_global) %>% + dplyr::summarise( + n_occ = length(which(present == 1)), + n_abs = length(which(present == 0)) + ) + +folds_uniform = folds_summary %>% + dplyr::group_by(species, fold_global) %>% + dplyr::summarise( + is_uniform = n_occ == 0 | n_abs == 0 + ) %>% + dplyr::group_by(species) %>% + dplyr::summarise( + folds_total = n(), + folds_uniform = length(which(is_uniform)), + folds_mixed = folds_total - folds_uniform + ) %>% + dplyr::ungroup() + +plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + + geom_histogram(binwidth=0.5, fill = "orange") + + labs(title="Number of folds per species", x="Count", y = "Number of species") + + theme_minimal() +plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) + + geom_histogram(binwidth=0.5, fill = "violet") + + labs(title="Number of uniform folds per species", x="Count", y = "Number of species", + subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") + + theme_minimal() +plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) + + geom_histogram(binwidth=0.5, fill = "blue") + + labs(title="Number of mixed folds per species", x="Count", y = "Number of species", + subtitle="Mixed folds both absences and presences and can be used in model evaluation.") + + theme_minimal() + +plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1) +ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7) + + diff --git a/R/04_01_modelling_ssdm.R b/R/04_01_modelling_ssdm.R index 77c25ece6da4f63726d7ca1f31bae6a490cda626..a62cd3e0efecb194d26f89ceaab31c58caf033b4 100644 --- a/R/04_01_modelling_ssdm.R +++ b/R/04_01_modelling_ssdm.R @@ -193,7 +193,7 @@ data_split = model_data %>% for(pa_spec in data_split){ ## Preparations #### species = pa_spec$species[1] - print(species) + print(as.character(species)) # Create inner CV folds for model training cv_train = blockCV::cv_spatial( @@ -231,7 +231,7 @@ for(pa_spec in data_split){ method = "ranger", trControl = train_ctrl, tuneLength = 8, - weights = data_train$weight, + weights = pa_spec$weight, verbose = F ) save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData")) @@ -246,7 +246,7 @@ for(pa_spec in data_split){ method = "gbm", trControl = train_ctrl, tuneLength = 8, - weights = data_train$weight, + weights = pa_spec$weight, verbose = F ) save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData")) @@ -259,7 +259,7 @@ for(pa_spec in data_split){ y = pa_spec$present_fct, method = "gamSpline", tuneLength = 8, - weights = data_train$weight, + weights = pa_spec$weight, trControl = train_ctrl ) save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData")) @@ -277,7 +277,7 @@ for(pa_spec in data_split){ burnin = 200L, early_stopping = 25, lr = 0.001, - batchsize = min(ceiling(nrow(data_train)/10), 64), + batchsize = min(ceiling(nrow(pa_spec)/10), 64), dropout = 0.25, optimizer = config_optimizer("adam"), validation = 0.2, diff --git a/R/04_04_modelling_msdm_rf.R b/R/04_04_modelling_msdm_rf.R index 4544b7321a86f88f54d501a9ac49e284575ca770..1e03f70098f0c9086cdd9afc515fbb3578d2704c 100644 --- a/R/04_04_modelling_msdm_rf.R +++ b/R/04_04_modelling_msdm_rf.R @@ -67,7 +67,7 @@ rf_fit = caret::train( metric = "Accuracy", trControl = train_ctrl, tuneLength = 8, - weights = full_data$weight, + weights = model_data$weight, num.threads = 48 ) diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd index 8fb626b2f8adde35f089c3d5e1e3ae1826634066..38096cb26e46f1fdaf7f6917c449cf8905916bab 100644 --- a/R/05_01_performance_report.qmd +++ b/R/05_01_performance_report.qmd @@ -12,9 +12,9 @@ library(plotly) library(DT) load("../data/r_objects/model_data.RData") -#load("../data/r_objects/ssdm_performance.RData") +load("../data/r_objects/ssdm_performance.RData") load("../data/r_objects/msdm_embed_performance.RData") -#load("../data/r_objects/msdm_onehot_performance.RData") +load("../data/r_objects/msdm_onehot_performance.RData") load("../data/r_objects/msdm_rf_performance.RData") load("../data/r_objects/range_maps.RData") @@ -55,11 +55,32 @@ loess_fit = function(x, y, span = 0.75){ ) } +# Determine valid folds for evaluation +# A fold is valid if there are at least n presences in the training and validation data +obs_fold = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(record_type == "core", present == 1) %>% + dplyr::group_by(species, fold_global) %>% + dplyr::summarise(obs_fold = n()) + +obs_total = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(record_type == "core", present == 1) %>% + dplyr::group_by(species) %>% + dplyr::summarise(obs_total = n()) + +folds_valid = obs_fold %>% + dplyr::left_join(obs_total) %>% + dplyr::mutate(obs_train = obs_total-obs_fold) %>% + dplyr::filter(obs_train > 5 & obs_fold > 5) %>% + dplyr::select(species, fold_global) + # Performance table -performance = msdm_embed_performance %>% - #bind_rows(msdm_embed_performance) %>% - #bind_rows(msdm_onehot_performance) %>% +performance = ssdm_performance %>% + bind_rows(msdm_embed_performance) %>% + bind_rows(msdm_onehot_performance) %>% bind_rows(msdm_rf_performance) %>% + semi_join(folds_valid, by = c("species", "fold_global")) %>% dplyr::group_by(species, model, metric) %>% dplyr::summarise(value = mean(value, na.rm = T)) %>% dplyr::mutate( @@ -199,7 +220,7 @@ df_join = model_data %>% dplyr::summarise(n_obs = n()) performance = performance %>% - dplyr::left_join(df_join, by = "species") + dplyr::inner_join(df_join, by = "species") # Range size df_join = range_maps %>% @@ -377,7 +398,7 @@ Range size was calculated based on polygon layers from the IUCN Red List of Thre ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "auc") -plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` @@ -385,7 +406,7 @@ bslib::card(plot, full_screen = T) ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "f1") -plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` @@ -393,7 +414,7 @@ bslib::card(plot, full_screen = T) ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "kappa") -plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` @@ -401,7 +422,7 @@ bslib::card(plot, full_screen = T) ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "accuracy") -plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` @@ -409,7 +430,7 @@ bslib::card(plot, full_screen = T) ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "precision") -plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` @@ -417,7 +438,7 @@ bslib::card(plot, full_screen = T) ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "recall") -plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` ::: @@ -430,7 +451,7 @@ bslib::card(plot, full_screen = T) ```{r echo = FALSE} df_plot = dplyr::filter(performance, metric == "auc") %>% add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km]") +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km²]") bslib::card(plot, full_screen = T) ``` diff --git a/R/05_01_performance_report_rf.qmd b/R/05_01_performance_report_rf.qmd deleted file mode 100644 index 534a5f8924df3f579029fbc76be7c72bba0db14b..0000000000000000000000000000000000000000 --- a/R/05_01_performance_report_rf.qmd +++ /dev/null @@ -1,692 +0,0 @@ ---- -title: "SDM Performance report" -format: html -editor: visual -engine: knitr ---- - -```{r init, echo = FALSE, include = FALSE} -library(tidyverse) -library(sf) -library(plotly) -library(DT) - -load("../data/r_objects/model_data.RData") -load("../data/r_objects/range_maps.RData") -load("../data/r_objects/range_maps_gridded.RData") -load("../data/r_objects/occs_final.RData") -load("../data/r_objects/functional_groups.RData") - -sf::sf_use_s2(use_s2 = FALSE) - -# Load performance of different RF versions -load("../data/r_objects/deprecated/msdm_rf_no_species_random_abs_performance.RData") -load("../data/r_objects/deprecated/msdm_rf_no_species_performance.RData") -load("../data/r_objects/msdm_rf_performance.RData") - -msdm_rf_no_species_performance$model = "rf_noSpec_oldVars_informedAbs" -msdm_rf_no_species_performance$metric = tolower(msdm_rf_no_species_performance$metric) -msdm_rf_no_species_random_abs_performance$model = "rf_noSpec_oldVars_randomAbs" -msdm_rf_no_species_random_abs_performance$metric = tolower(msdm_rf_no_species_random_abs_performance$metric) -msdm_rf_performance$model = "rf_noSpec_newVars_randomAbs" -``` - -```{r globals, echo = FALSE, cache = TRUE, include = FALSE} -# Regression functions -asym_fit = function(x, y){ - nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) - new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) - data.frame( - x = new_x, - fit = predict(nls_fit, newdata = data.frame(x = new_x)) - ) -} - -glm_fit = function(x, y, family = "binomial"){ - glm_fit = suppressWarnings(glm(y~x, family = family)) - new_x = seq(min(x), max(x), length.out = 100) - data.frame( - x = new_x, - fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") - ) -} - -loess_fit = function(x, y, span = 0.75){ - df = data.frame(x = x, y = y) - loess_fit = loess(y ~ x, data = df, span = span) - new_x = seq(min(x), max(x), length.out = 100) - data.frame( - x = new_x, - fit = predict(loess_fit, newdata = data.frame(x = new_x)) - ) -} - -# Performance table -performance = msdm_rf_performance %>% - bind_rows(msdm_rf_no_species_performance) %>% - bind_rows(msdm_rf_no_species_random_abs_performance) %>% - dplyr::group_by(species, model, metric) %>% - dplyr::summarise(value = mean(value, na.rm = F)) %>% - dplyr::mutate( - value = case_when( - ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, - ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, - .default = value - ) - ) %>% - ungroup() - -plot_performance = function(df_plot, y_var, y_label, x_var, x_label, x_log = T, reg_func = loess_fit) { - df_plot = df_plot %>% - dplyr::rename( - x_var = !!x_var, - y_var = !!y_var - ) - - # Calculate regression lines for each model and metric combination - suppressWarnings({ - regression_lines = df_plot %>% - group_by(model) %>% - group_modify( ~ reg_func(.x[["x_var"]], .x[["y_var"]])) - }) - - # Create base plot - plot <- plot_ly() %>% - layout( - title = paste0("Model Performance vs. ", x_label), - xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")), - yaxis = list(title = y_label), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest' - ) - - # Points - for (model_name in unique(df_plot$model)) { - plot = plot %>% - add_markers( - data = filter(df_plot, model == model_name), - x = ~ x_var, - y = ~ y_var, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~ model, - hoverinfo = 'text', - text = ~ paste( - "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(y_var, 3) - ) - ) - } - - # Add regression lines - for (model_name in unique(df_plot$model)) { - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~ x, - y = ~ fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE - ) - } - - return(plot) -} - -add_partial_residuals = function(df, focus_var, control_vars, family = quasibinomial) { - model_formula <- as.formula(paste("value ~", paste(c(focus_var, control_vars), collapse = " + "))) - model = glm(model_formula, data = df, family = family) - - if (focus_var %in% names(model$coefficients)) { - df[[paste0(focus_var, "_partial")]] = residuals(model, type = "response") + model$coefficients[focus_var] * df[[focus_var]] - } - - return(df) -} - -``` - -## Summary - -This document contrasts the performance of different variations of random forest models. - -## Analysis - -### Quantify drivers of model performance - -```{r prepare_model_df, echo = FALSE, include = FALSE} -# Number of records -df_join = model_data %>% - sf::st_drop_geometry() %>% - dplyr::filter(present == 1) %>% - dplyr::group_by(species) %>% - dplyr::summarise(n_obs = n()) - -performance = performance %>% - dplyr::left_join(df_join, by = "species") - -# Range size -df_join = range_maps %>% - sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs") %>% - dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>% # range in sqkm - sf::st_drop_geometry() - -performance = performance %>% - inner_join(df_join, by = c("species" = "name_matched")) - -# Range coverage -range_maps_gridded = range_maps_gridded %>% - sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs") - -df_cells_total = range_maps_gridded %>% - dplyr::rename("species" = name_matched) %>% - group_by(species) %>% - summarise(cells_total = n()) %>% - st_drop_geometry() - -df_cells_occ = range_maps_gridded %>% - st_join(occs_final, join = st_intersects) %>% - filter(name_matched == species) %>% # Filter only intersections of the same species - group_by(species) %>% - summarise(cells_occupied = n_distinct(geometry)) %>% - st_drop_geometry() - -df_join = df_cells_total %>% - dplyr::inner_join(df_cells_occ, by = "species") %>% - dplyr::mutate(range_cov = cells_occupied / cells_total) %>% - dplyr::select(species, range_cov) - -performance = performance %>% - inner_join(df_join, by = "species") - -# Range coverage bias -df_occs_total = occs_final %>% - st_drop_geometry() %>% - group_by(species) %>% - summarise(occs_total = n()) - -df_join = df_occs_total %>% - dplyr::inner_join(df_cells_total, by = "species") %>% - dplyr::inner_join(df_cells_occ, by = "species") %>% - dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) %>% - dplyr::select(species, range_bias) - -performance = performance %>% - inner_join(df_join, by = "species") -``` - -### Number of records - -::: panel-tabset -#### Marginal Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") -plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") -plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") -plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") -plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") -plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") -plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` -::: - -#### Partial Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "n_obs", x_label = "Number of records") -bslib::card(plot, full_screen = T) -``` -::: -::: - -### Range size - -Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). - -::: panel-tabset -#### Marginal Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") -plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") -plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") -plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") -plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") -plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") -plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` -::: - -#### Partial Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km]") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_size", x_label = "Range size [km²]") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_size", x_label = "Range size [km²]") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_size", x_label = "Range size [km²]") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_size", x_label = "Range size [km²]") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_size", x_label = "Range size [km²]") -bslib::card(plot, full_screen = T) -``` -::: -::: - -### Range coverage - -Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. - -$$ -RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} -$$ - -::: panel-tabset -#### Marginal Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") -plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") -plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") -plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") -plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") -plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") -plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` -::: - -#### Partial Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_cov", x_label = "Range coverage") -bslib::card(plot, full_screen = T) -``` -::: -::: - -### Range coverage bias - -Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. - -$$ -RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)} -$$ - -Higher bias values indicate that occurrence records are spatially more clustered within the range of the species. - -::: panel-tabset -#### Marginal Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") -plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") -plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") -plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") -plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") -plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") -plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` -::: - -#### Partial Effects - -::: panel-tabset -##### AUC - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "auc") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### F1 - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "f1") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Cohen's kappa - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "kappa") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Accuracy - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "accuracy") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Precision - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "precision") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` - -##### Recall - -```{r echo = FALSE} -df_plot = dplyr::filter(performance, metric == "recall") %>% - add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) -plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") -bslib::card(plot, full_screen = T) -``` -::: -::: diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R index 4d347c4c53455fa3734dc3076e163aa68d0d2115..8f9bc5efb6235c1057daaa660651571863965df2 100644 --- a/R/05_02_publication_analysis.R +++ b/R/05_02_publication_analysis.R @@ -242,10 +242,10 @@ raster_data = terra::rast(raster_filepaths) %>% terra::crop(sf::st_bbox(sa_polygon)) %>% terra::project(sf::st_crs(model_data)$input) -specs = c("Histiotus velatus", "Hydrochoerus hydrochaeris", "Euryoryzomys legatus", "Eumops trumbulli") +specs = c("Aotus lemurinus") for(spec in specs){ pdf(file = paste0("plots/range_predictions/", spec, ".pdf"), width = 12) - plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm_onehot", "msdm_embed", "msdm_rf")) + plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf")) lapply(plots, print) dev.off() } diff --git a/R/_publish.yml b/R/_publish.yml index 5604c706877eb12f339edad6480b654f76b4eab2..1da1e8787c9b0c9b22c24ac552ae323c00102459 100644 --- a/R/_publish.yml +++ b/R/_publish.yml @@ -2,8 +2,6 @@ quarto-pub: - id: 77b30762-b473-447d-be18-d4bbd6261fbf url: 'https://chrkoenig.quarto.pub/sdm-performance-report' - - id: 7777d450-82af-4364-b4c1-b52d7139ade0 - url: 'https://chrkoenig.quarto.pub/rf-nospec-performance-report' - source: 05_01_performance_report_rf.qmd quarto-pub: - id: 98e776a5-fb0b-41f9-8733-446bf7ffb272 diff --git a/README.md b/README.md index aaaa50cd194bb8d2807ab511b9ca8845a4352836..a26b7475873243a602c58480864a3895bfd9033c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,62 @@ -# Symobio-modeling +# Codebase Documentation -Code and data accompanying SSDM vs MSDM model comparison \ No newline at end of file +This repository implements a species distribution modeling comparison study for about 600 South American mammal species. Specifically, the + +## Project Structure + +- **`R/`**: Contains all the R scripts organized by workflow steps. +- **`Symobio_modeling.Rproj`**: RStudio project file for easy navigation. +- **`README.md`**: High-level overview of the project. +- **`renv/`**: Manages package dependencies for reproducibility. +- **`renv.lock`**: Lockfile for `renv` to ensure consistent package versions. + +## Workflow Overview + +The workflow is divided into several stages, each represented by scripts in the `R/` directory. Below is a summary of the key steps: + +### 1. Preparation of Geographic Data +- **`01_01_range_map_preparation.R`**: Processes IUCN mammal range maps for South America, converting them to a standardized raster format with consistent projection for downstream analysis. +- **`01_02_raster_preparation.R`**: Prepares environmental predictor rasters (e.g., climate, elevation, land cover) by cropping to South America extent, resampling to consistent resolution, and performing any necessary transformations. + +### 2. Preparation of complementary species-level data + + +- **`02_01_functional_group_assignment.R`**: Assigns mammal species to functional groups based on diet, locomotion, and body size characteristics, creating categorical variables for modeling. +- **`02_02_functional_traits_preparation.R`**: Cleans and standardizes continuous trait data (body mass, diet breadth, etc.) for all study species, handling missing values through imputation where necessary. +- **`02_03_phylo_preparation.R`**: Extracts phylogenetic information for target mammal species, computes phylogenetic distance matrices, and prepares the data for inclusion in models. + +### 3. Preparation of Presence/Absence Data +- **`03_01_presence_preparation.R`**: Processes occurrence records from GBIF and other sources, applies spatial filtering to reduce sampling bias, and aligns taxonomic nomenclature. +- **`03_02_absence_preparation.R`**: Generates pseudo-absence points using a stratified random approach, with constraints based on environmental conditions and range map boundaries. +- **`03_03_dataset_exploration.R`**: Produces descriptive statistics and visualizations of presence/absence data, environmental variables, and species coverage to assess data quality. +- **`03_04_model_data_finalization.R`**: Merges all prepared datasets (occurrences, absences, predictors) into final modeling datasets, splits data into training/testing sets, and applies any necessary scaling or transformations. + +### 4. Modeling +- **`04_01_modelling_ssdm.R`**: Implements traditional single-species distribution modeling (SSDM) approaches with selected algorithms (e.g., MaxEnt, random forests), including hyperparameter tuning. +- **`04_02_modelling_msdm_embed.R`**: Develops multi-species distribution models using neural network approaches that embed species identities into a latent space, capturing inter-species relationships. +- **`04_03_modelling_msdm_onehot.R`**: Implements multi-species distribution models using one-hot encoding for species identities, enabling joint prediction across all species simultaneously. +- **`04_04_modelling_msdm_rf.R`**: Implements random forest-based multi-species distribution modeling, incorporating species identity as a predictor variable alongside environmental variables. + +### 5. Analysis +- **`05_01_performance_report.qmd`**: Generates comprehensive reports on model performance metrics (AUC, TSS, etc.) for all modeling approaches, with visualizations comparing performance across species and methods. +- **`05_02_publication_analysis.qmd`**: Conducts advanced statistical analyses of model results, creates publication-quality figures, and summarizes findings for manuscript preparation. + +## Miscellaneous +- **`utils.R`**: Contains utility functions used across multiple scripts, including data processing helpers, custom evaluation metrics, and visualization functions. + +### Miscellaneous +- **`utils.R`**: + +## Getting Started + +1. Clone the repository and open the `Symobio_modeling.Rproj` file in RStudio. +2. Restore the project environment using `renv`: + ```r + renv::restore() + ``` +3. Run the scripts in the R/ directory sequentially. Some scripts, especially for model fitting, may run a long time and benefit from powerful hardware. + +## Additional Notes +- Ensure that all required input data (e.g., range maps, raster files) is available in the expected directories. +- Outputs from each script are typically saved to disk and used as inputs for subsequent scripts. +- Refer to the README.md file for any additional project-specific instructions. diff --git a/occurrences.png b/occurrences.png deleted file mode 100644 index 0a8e43d1943357ed699740edbce1198cfddb6b33..0000000000000000000000000000000000000000 Binary files a/occurrences.png and /dev/null differ