diff --git a/R/01_range_map_preparation.R b/R/01_range_map_preparation.R
index 1ee112fb682464acce54712038a841606d43fada..0a0717690f5370dd4fa90ba24f8c60dde544688a 100644
--- a/R/01_range_map_preparation.R
+++ b/R/01_range_map_preparation.R
@@ -108,4 +108,6 @@ range_dist = mean_minimum_distances %>%
   column_to_rownames("species1") %>%
   as.matrix()
 
+range_dist = range_dist / max(range_dist) # Scale to (0,1)
+
 save(range_dist, file = "data/r_objects/range_dist.RData")
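The added line rescales all pairwise range distances by the matrix maximum. A minimal standalone sketch of what this does numerically, using a made-up symmetric matrix in place of the real `range_dist` (all object names below are illustrative, not from the repository):

```r
# Toy stand-in for range_dist: symmetric, zero diagonal, arbitrary units.
set.seed(1)
m = matrix(runif(25, 10, 800), 5, 5)
toy_dist = (m + t(m)) / 2
diag(toy_dist) = 0

toy_scaled = toy_dist / max(toy_dist)  # the operation added in this diff
range(toy_scaled)                      # all values now lie in [0, 1]

# Dividing by a positive constant rescales the eigenvalues but leaves the
# eigenvectors (used later as species embeddings in R/04_06) unchanged up to sign.
all.equal(abs(eigen(toy_scaled)$vectors), abs(eigen(toy_dist)$vectors))
```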
diff --git a/R/04_04_msdm_embed_traits.R b/R/04_04_msdm_embed_traits.R
index 9dd0b34ac126ae121c90aba426caac94db58f756..fce4d556361aeb8f157d6fb088a97327d433eb85 100644
--- a/R/04_04_msdm_embed_traits.R
+++ b/R/04_04_msdm_embed_traits.R
@@ -5,16 +5,16 @@ library(cito)
 source("R/utils.R")
 
 load("data/r_objects/model_data.RData")
-load("data/r_objects/phylo_dist.RData")
+load("data/r_objects/func_dist.RData")
 
 # ----------------------------------------------------------------------#
 # Prepare data ####
 # ----------------------------------------------------------------------#
-model_species = intersect(model_data$species, colnames(phylo_dist))
+model_species = intersect(model_data$species, names(func_dist))
 
 model_data_final = model_data %>%
   dplyr::filter(species %in% !!model_species) %>%
-  # dplyr::mutate_at(vars(starts_with("layer")), scale) %>%   # Scaling seems to make things worse often
+  # dplyr::mutate_at(vars(starts_with("layer")), ~as.vector(scale(.))) %>%   # Scaling seems to make things worse often
   dplyr::mutate(species_int = as.integer(as.factor(species)))
 
 train_data = dplyr::filter(model_data_final, train == 1)
@@ -31,6 +31,8 @@ predictors = paste0("layer_", 1:19)
 # ----------------------------------------------------------------------#
 # 1. Train
 formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
+
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
 msdm_fit_embedding_traits_static = dnn(
   formula,
   data = train_data,
@@ -87,6 +89,8 @@ save(msdm_results_embedding_traits_static, file = "data/r_objects/msdm_results_e
 # With training the embedding ####
 # ------------------------------------------------------------ ------#
 formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
+
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
 msdm_fit_embedding_traits_trained = dnn(
   formula,
   data = train_data,
diff --git a/R/04_05_msdm_embed_phylo.R b/R/04_05_msdm_embed_phylo.R
index df8b57ab681a51cedfd4f641123de24ee7f33838..11134f38f5859acd27bc4b2034b01f3db61530ce 100644
--- a/R/04_05_msdm_embed_phylo.R
+++ b/R/04_05_msdm_embed_phylo.R
@@ -31,6 +31,8 @@ predictors = paste0("layer_", 1:19)
 # ----------------------------------------------------------------------#
 # 1. Train
 formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
+
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
 msdm_fit_embedding_phylo_static = dnn(
   formula,
   data = train_data,
@@ -87,6 +89,8 @@ save(msdm_results_embedding_phylo_static, file = "data/r_objects/msdm_results_em
 # With training the embedding ####
 # ------------------------------------------------------------ ------#
 formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
+
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
 msdm_fit_embedding_phylo_trained = dnn(
   formula,
   data = train_data,
diff --git a/R/04_06_msdm_embed_ranges.R b/R/04_06_msdm_embed_ranges.R
new file mode 100644
index 0000000000000000000000000000000000000000..b9bd65d3b3ab247a48dac3929b8f092730bcbe66
--- /dev/null
+++ b/R/04_06_msdm_embed_ranges.R
@@ -0,0 +1,144 @@
+library(dplyr)
+library(tidyr)
+library(cito)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data.RData")
+load("data/r_objects/range_dist.RData")
+
+# ----------------------------------------------------------------------#
+# Prepare data ####
+# ----------------------------------------------------------------------#
+model_species = intersect(model_data$species, colnames(range_dist))
+
+model_data_final = model_data %>%
+  dplyr::filter(species %in% !!model_species) %>%
+  # dplyr::mutate_at(vars(starts_with("layer")), scale) %>%   # Scaling seems to make things worse often
+  dplyr::mutate(species_int = as.integer(as.factor(species)))
+
+train_data = dplyr::filter(model_data_final, train == 1)
+test_data = dplyr::filter(model_data_final, train == 0)
+
+sp_ind = match(model_species, colnames(range_dist))
+range_dist = range_dist[sp_ind, sp_ind]
+
+embeddings = eigen(range_dist)$vectors[,1:20]
+mode(embeddings) = "numeric"
+predictors = paste0("layer_", 1:19)
+
+# ----------------------------------------------------------------------#
+# Without training the embedding ####
+# ----------------------------------------------------------------------#
+# 1. Train
+formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
+
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
+msdm_fit_embedding_range_static = dnn(
+  formula,
+  data = train_data,
+  hidden = c(500L, 500L, 500L),
+  loss = "binomial",
+  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+  epochs = 15000L,
+  lr = 0.01,
+  baseloss = 1,
+  batchsize = nrow(train_data),
+  dropout = 0.1,
+  burnin = 100,
+  optimizer = config_optimizer("adam", weight_decay = 0.001),
+  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
+  early_stopping = 250,
+  validation = 0.3,
+  device = "cuda",
+)
+save(msdm_fit_embedding_range_static, file = "data/r_objects/msdm_fit_embedding_range_static.RData")
+
+# 2. Evaluate
+# Per species
+load("data/r_objects/msdm_fit_embedding_range_static.RData")
+data_split = test_data %>%
+  group_by(species_int) %>%
+  group_split()
+
+msdm_results_embedding_range_static = lapply(data_split, function(data_spec){
+  target_species = data_spec$species[1]
+  data_spec = dplyr::select(data_spec, -species)
+
+  msdm_performance = tryCatch({
+    evaluate_model(msdm_fit_embedding_range_static, data_spec)
+  }, error = function(e){
+    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
+  })
+
+  performance_summary = tibble(
+    species = !!target_species,
+    obs = length(which(model_data$species == target_species)),
+    model = "MSDM_embed_informed_range_static",
+    auc = msdm_performance$AUC,
+    accuracy = msdm_performance$Accuracy,
+    kappa = msdm_performance$Kappa,
+    precision = msdm_performance$Precision,
+    recall = msdm_performance$Recall,
+    f1 = msdm_performance$F1
+  )
+}) %>% bind_rows()
+
+save(msdm_results_embedding_range_static, file = "data/r_objects/msdm_results_embedding_range_static.RData")
+
+# ----------------------------------------------------------------------#
+# With training the embedding ####
+# ----------------------------------------------------------------------#
+formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
+
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
+msdm_fit_embedding_range_trained = dnn(
+  formula,
+  data = train_data,
+  hidden = c(500L, 500L, 500L),
+  loss = "binomial",
+  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+  epochs = 15000L,
+  lr = 0.01,
+  baseloss = 1,
+  batchsize = nrow(train_data),
+  dropout = 0.1,
+  burnin = 100,
+  optimizer = config_optimizer("adam", weight_decay = 0.001),
+  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
+  early_stopping = 250,
+  validation = 0.3,
+  device = "cuda",
+)
+save(msdm_fit_embedding_range_trained, file = "data/r_objects/msdm_fit_embedding_range_trained.RData")
+
+# 2. Evaluate
+load("data/r_objects/msdm_fit_embedding_range_trained.RData")
+data_split = test_data %>%
+  group_by(species_int) %>%
+  group_split()
+
+msdm_results_embedding_range_trained = lapply(data_split, function(data_spec){
+  target_species = data_spec$species[1]
+  data_spec = dplyr::select(data_spec, -species)
+
+  msdm_performance = tryCatch({
+    evaluate_model(msdm_fit_embedding_range_trained, data_spec)
+  }, error = function(e){
+    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
+  })
+
+  performance_summary = tibble(
+    species = !!target_species,
+    obs = length(which(model_data$species == target_species)),
+    model = "MSDM_embed_informed_range_trained",
+    auc = msdm_performance$AUC,
+    accuracy = msdm_performance$Accuracy,
+    kappa = msdm_performance$Kappa,
+    precision = msdm_performance$Precision,
+    recall = msdm_performance$Recall,
+    f1 = msdm_performance$F1
+  )
+}) %>% bind_rows()
+
+save(msdm_results_embedding_range_trained, file = "data/r_objects/msdm_results_embedding_range_trained.RData")
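The new script seeds the species embedding with the leading eigenvectors of the scaled range-distance matrix and passes them to cito's `e()` term. As a quick aid for reviewing that wiring, here is a small sanity-check sketch; it assumes `e()` uses one row of `weights` per integer level of `species_int` and reuses the object names from R/04_06_msdm_embed_ranges.R (run it after the "Prepare data" block):

```r
# Sketch only: shape and indexing checks before fitting.
stopifnot(
  nrow(embeddings) == length(model_species),             # one embedding row per modelled species
  ncol(embeddings) == 20,                                 # the 20 leading eigenvectors are kept
  max(model_data_final$species_int) == nrow(embeddings),  # species_int can index every row
  !anyNA(embeddings)                                      # eigen() returned purely numeric output
)
```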
diff --git a/R/05_performance_analysis.qmd b/R/05_01_performance_analysis.qmd
similarity index 97%
rename from R/05_performance_analysis.qmd
rename to R/05_01_performance_analysis.qmd
index 83b3216173a38242c2462c62d1c14bf57554c9bf..2fea07f540a92561912fe6a7cc07aae8ca55ac9d 100644
--- a/R/05_performance_analysis.qmd
+++ b/R/05_01_performance_analysis.qmd
@@ -1,5 +1,5 @@
 ---
-title: "sSDM Performance analysis"
+title: "SDM Performance analysis"
 format: html
 editor: visual
 engine: knitr
@@ -66,7 +66,7 @@ performance = bind_rows(ssdm_results, msdm_results, msdm_results_embedding_train
 
 ## Summary
 
-This document summarizes the performance of different sSDM and mMSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.
+This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.
 
 Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).
 
@@ -529,7 +529,7 @@ bslib::card(plot, full_screen = T)
 Functional groups were assigned based on taxonomic order. The following groupings were used:
 
 | Functional group | Taxomic orders |
-|--------------------------|----------------------------------------------|
+|-------------------|-----------------------------------------------------|
 | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla |
 | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
 | arboreal | Primates, Pilosa |
diff --git a/R/05_02_MSDM_comparison.qmd b/R/05_02_MSDM_comparison.qmd
new file mode 100644
index 0000000000000000000000000000000000000000..667edb695e5cae12e2de2fff9ea5504d54f51ad0
--- /dev/null
+++ b/R/05_02_MSDM_comparison.qmd
@@ -0,0 +1,66 @@
+---
+title: "mSDM comparison"
+format: html
+editor: visual
+engine: knitr
+---
+
+```{r init, echo = FALSE, include = FALSE}
+library(tidyverse)
+library(sf)
+library(plotly)
+library(DT)
+
+load("../data/r_objects/msdm_results_embedding_raw.RData")
+load("../data/r_objects/msdm_results_embedding_traits_static.RData")
+load("../data/r_objects/msdm_results_embedding_traits_trained.RData")
+load("../data/r_objects/msdm_results_embedding_phylo_static.RData")
+load("../data/r_objects/msdm_results_embedding_phylo_trained.RData")
+load("../data/r_objects/msdm_results_embedding_range_static.RData")
+load("../data/r_objects/msdm_results_embedding_range_trained.RData")
+
+sf::sf_use_s2(use_s2 = FALSE)
+```
+
+```{r globals, echo = FALSE, include = FALSE}
+results_embedding_raw = msdm_results_embedding_raw %>%
+  dplyr::mutate(
+    embedding = "raw",
+    training = "trained"
+  )
+
+results_embedding_informed = c(
+  "msdm_results_embedding_phylo_static",
+  "msdm_results_embedding_phylo_trained",
+  "msdm_results_embedding_range_static",
+  "msdm_results_embedding_range_trained",
+  "msdm_results_embedding_traits_static",
+  "msdm_results_embedding_traits_trained"
+)
+
+results_embedding_informed_merged = lapply(results_embedding_informed, function(df_name){
+  df = get(df_name)
+  name_split = str_split_1(df_name, pattern = "_")
+  df$embedding = name_split[4]
+  df$training = name_split[5]
+  df
+}) %>% bind_rows()
+
+results_final = results_embedding_raw %>%
+  bind_rows(results_embedding_informed_merged) %>%
+  drop_na()
+```
+
+```{r overview_tables, echo = FALSE}
+auc_overview = results_final %>%
+  pivot_wider(names_from = model, values_from = auc, id_cols = c(species, obs)) %>%
+  dplyr::arrange(obs)
+
+accuracy_overview = results_final %>%
+  pivot_wider(names_from = model, values_from = accuracy, id_cols = c(species, obs)) %>%
+  dplyr::arrange(obs)
+
+f1_overview = results_final %>%
+  pivot_wider(names_from = model, values_from = f1, id_cols = c(species, obs)) %>%
+  dplyr::arrange(obs)
+```
\ No newline at end of file
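The new comparison document builds `auc_overview`, `accuracy_overview`, and `f1_overview` but stops short of rendering them. One possible follow-up, not part of this change, would be an additional chunk that displays the wide tables with DT (already loaded in the init chunk); the `pageLength` value and caption below are placeholder choices:

```r
# Hypothetical display chunk for 05_02_MSDM_comparison.qmd (not included in the diff above).
DT::datatable(
  auc_overview,
  options = list(pageLength = 10, scrollX = TRUE),
  caption = "AUC per species (rows ordered by number of records) across mSDM variants"
)
```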