From 496eb761991486175ce3163a0c226757c171a789 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Wed, 27 Nov 2024 09:16:02 +0100
Subject: [PATCH] phylo dist calculation

---
 R/01_range_map_preparation.R                  | 66 +++++++++++++++++--
 R/02_03_phylo_preparation.R                   | 50 ++++++++++++++
 ...02_msdm_embed.R => 04_02_msdm_embed_raw.R} | 27 ++++----
 ...d_informed.R => 04_03_msdm_embed_traits.R} | 48 +++++++-------
 R/05_performance_analysis.qmd                 |  8 +--
 5 files changed, 153 insertions(+), 46 deletions(-)
 create mode 100644 R/02_03_phylo_preparation.R
 rename R/{04_02_msdm_embed.R => 04_02_msdm_embed_raw.R} (72%)
 rename R/{04_03_msdm_embed_informed.R => 04_03_msdm_embed_traits.R} (74%)

diff --git a/R/01_range_map_preparation.R b/R/01_range_map_preparation.R
index 120b286..1ee112f 100644
--- a/R/01_range_map_preparation.R
+++ b/R/01_range_map_preparation.R
@@ -2,9 +2,13 @@ library(tidyverse)
 library(Symobio)
 library(sf)
 library(rnaturalearth)
+library(Rfast)
 
 sf::sf_use_s2(use_s2 = FALSE)
 
+# --------------------------------------#
+# Process range maps ####
+# --------------------------------------#
 # Load range maps
 range_maps = st_read(
   "~/share/groups/mas_data/Saved_Data_Dropbox_Business/Datensätze/Range Maps/IUCN_range_maps_mammals_version2016/TERRESTRIAL_MAMMALS.shp",
@@ -34,26 +38,74 @@ range_maps_names_matched = lapply(unique(range_maps$binomial), function(name){
   }, error = function(e){
     return(NULL)
   })
-})
+}) %>%
+  bind_rows()
+
 save(range_maps_names_matched, file = "data/r_objects/range_maps_names_matched.RData")
 
 # Subset range maps to target species and focal region
-names_matched = Filter(is.data.frame, range_maps_names_matched) %>%
-  bind_rows()
-
 range_maps = range_maps %>%
-  inner_join(names_matched, by = c("binomial" = "name_orig")) %>%
+  inner_join(range_maps_names_matched, by = c("binomial" = "name_orig")) %>%
   group_by(name_matched) %>%
   summarize(geometry = suppressMessages(st_union(geometry)))
 
 save(range_maps, file = "data/r_objects/range_maps.RData")
 
-# Gridded range maps
+# -------------------------------------------#
+# Process gridded range maps ####
+# -------------------------------------------#
 range_maps_gridded = rnaturalearth::ne_countries() %>%
   dplyr::filter(continent == "South America") %>%
   sf::st_union() %>%
   st_make_grid(square = FALSE, cellsize = 1) %>%
   st_sf() %>%
-  st_join(range_maps, st_intersects, left = F)
+  st_join(range_maps, st_intersects, left = F) %>%
+  na.omit()
 
 save(range_maps_gridded, file = "data/r_objects/range_maps_gridded.RData")
+
+# ----------------------------------------------#
+# Calculate range dissimilarity ####
+# ----------------------------------------------#
+load("data/r_objects/range_maps_gridded.RData")
+
+range_maps_gridded_id = range_maps_gridded %>%
+  dplyr::group_by(geometry) %>%
+  dplyr::mutate(geom_id = cur_group_id()) %>%
+  ungroup()
+
+geometries_unique = range_maps_gridded_id %>%
+  dplyr::select(-name_matched) %>%
+  group_by(geom_id) %>%
+  slice_head(n = 1)
+
+geom_dist = sf::st_distance(geometries_unique, geometries_unique) %>% # Takes ~ 10 mins
+  as.matrix()
+
+range_maps_split = range_maps_gridded_id %>%
+  group_by(name_matched) %>%
+  group_split()
+
+mean_minimum_distances = lapply(range_maps_split, function(df1){ # Takes ~ 30 mins
+  df_dist = lapply(range_maps_split, function(df2){
+    dists_subset = geom_dist[df1$geom_id, df2$geom_id, drop = F]
+    dist = mean(Rfast::rowMins(dists_subset, value = T)) # Mean minimum distance
+    
+    df_result = data.frame(
+      species1 = df1$name_matched[1],
+      species2 = df2$name_matched[1],
+      dist = dist
+    )
+    
+    return(df_result)
+  }) %>% bind_rows()
+  
+  return(df_dist)
+}) %>% bind_rows()
+
+range_dist = mean_minimum_distances %>%
+  pivot_wider(names_from = species2, values_from = dist) %>%
+  column_to_rownames("species1") %>%
+  as.matrix()
+
+save(range_dist, file = "data/r_objects/range_dist.RData")
diff --git a/R/02_03_phylo_preparation.R b/R/02_03_phylo_preparation.R
new file mode 100644
index 0000000..f004993
--- /dev/null
+++ b/R/02_03_phylo_preparation.R
@@ -0,0 +1,50 @@
+library(tidyverse)
+library(phytools)
+library(ape)
+library(Symobio)
+
+forest = read.nexus("data/phylogenies/Small_phylogeny.nex")
+load("data/r_objects/range_maps_names_matched.RData")
+
+# Get taxonomic information for target species
+phylo_names_unique = unique(forest$UNTITLED$tip.label)
+
+phylo_names_matched = lapply(phylo_names_unique, function(name){
+  tryCatch({
+    name_query = stringr::str_replace_all(name, "_", " ")
+    match_result = Symobio::gbif_match_name(name = name_query)
+    if(match_result$status != "ACCEPTED"){
+      match_result = Symobio::gbif_match_name(usageKey = match_result$acceptedUsageKey)
+    }
+    
+    name_matched = if("species" %in% names(match_result)) match_result$species else NA
+    data.frame(name_orig = name, name_matched = name_matched)
+  }, error = function(e){
+    return(NULL)
+  })
+}) %>% bind_rows()
+
+save(phylo_names_matched, file = "data/r_objects/phylo_names_matched.RData")
+
+# Trim forest to target species
+phylo_names_target = dplyr::filter(phylo_names_matched, name_matched %in% range_maps_names_matched$name_matched)
+
+forest_pruned = lapply(forest, function(x) {
+  labels_drop = setdiff(x$tip.label, phylo_names_target$name_orig)
+  x_pruned = drop.tip(x, labels_drop)
+  labels_new = phylo_names_target$name_matched[match(x_pruned$tip.label, phylo_names_target$name_orig)] # rename tips to their GBIF-matched names
+  x_pruned$tip.label = labels_new
+  return(x_pruned)
+})
+
+class(forest_pruned) <- "multiPhylo"
+
+# Calculate pairwise phylogenetic distances across a random sample of trees
+indices = sample(length(forest_pruned), 50)
+dists = lapply(indices, function(i){
+  cophenetic.phylo(forest_pruned[[i]])
+})
+
+# Save result
+phylo_dist = Reduce("+", dists) / length(dists)
+save(phylo_dist, file = "data/r_objects/phylo_dist.RData")
\ No newline at end of file
diff --git a/R/04_02_msdm_embed.R b/R/04_02_msdm_embed_raw.R
similarity index 72%
rename from R/04_02_msdm_embed.R
rename to R/04_02_msdm_embed_raw.R
index 6c825fd..6e7924a 100644
--- a/R/04_02_msdm_embed.R
+++ b/R/04_02_msdm_embed_raw.R
@@ -12,45 +12,46 @@ train_data = dplyr::filter(model_data, train == 1)
 test_data = dplyr::filter(model_data, train == 0)
 
 # ----------------------------------------------------------------------#
-# Train model ####
+# Train model ####
 # ----------------------------------------------------------------------#
 predictors = paste0("layer_", 1:19)
 
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 20, lambda = 0.000001)"))
+formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 50, lambda = 0.000001)"))
 
-msdm_fit = dnn(
+plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with wider limits; training loss points are drawn into it
+msdm_fit_embedding_raw = dnn(
   formula,
   data = train_data,
-  hidden = c(500L, 1000L, 1000L),
+  hidden = c(500L, 500L, 500L),
   loss = "binomial",
   activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 25000L,
+  epochs = 15000L,
   lr = 0.01,
   baseloss = 1,
-  batchsize = nrow(train_data)/5,
-  dropout = 0.25,
+  batchsize = nrow(train_data),
+  dropout = 0.1,
   burnin = 100,
   optimizer = config_optimizer("adam", weight_decay = 0.001),
   lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
   early_stopping = 250,
   validation = 0.3,
-  device = "cuda"
+  device = "cuda",
 )
 
-save(msdm_fit, file = "data/r_objects/msdm_fit2.RData")
+save(msdm_fit_embedding_raw, file = "data/r_objects/msdm_fit_embedding_raw.RData")
 
 # ----------------------------------------------------------------------#
 # Evaluate model ####
 # ----------------------------------------------------------------------#
-load("data/r_objects/msdm_fit.RData")
+load("data/r_objects/msdm_fit_embedding_raw.RData")
 
 data_split = split(model_data, model_data$species)
 
-msdm_results = lapply(data_split, function(data_spec){
+msdm_results_embedding_raw = lapply(data_split, function(data_spec){
   test_data = dplyr::filter(data_spec, train == 0) %>% dplyr::select(-species)
   
   msdm_performance = tryCatch({
-    evaluate_model(msdm_fit, test_data)
+    evaluate_model(msdm_fit_embedding_raw, test_data)
   }, error = function(e){
     list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
   })
@@ -68,4 +69,4 @@ msdm_results = lapply(data_split, function(data_spec){
   )
 }) %>% bind_rows()
 
-save(msdm_results, file = "data/r_objects/msdm_results.RData")
+save(msdm_results_embedding_raw, file = "data/r_objects/msdm_results_embedding_raw.RData")
diff --git a/R/04_03_msdm_embed_informed.R b/R/04_03_msdm_embed_traits.R
similarity index 74%
rename from R/04_03_msdm_embed_informed.R
rename to R/04_03_msdm_embed_traits.R
index 718a8e9..774121e 100644
--- a/R/04_03_msdm_embed_informed.R
+++ b/R/04_03_msdm_embed_traits.R
@@ -31,37 +31,39 @@ predictors = paste0("layer_", 1:19)
 # ----------------------------------------------------------------------#
 # 1. Train
 formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
-msdm_fit_embedding_untrained = dnn(
+msdm_fit_embedding_traits_static = dnn(
   formula,
   data = train_data,
-  hidden = c(200L, 200L, 200L),
+  hidden = c(500L, 500L, 500L),
   loss = "binomial",
   activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 12000L,
-  lr = 0.01,
-  batchsize = nrow(train_data)/5,
+  epochs = 15000L,
+  lr = 0.01,
+  baseloss = 1,
+  batchsize = nrow(train_data),
   dropout = 0.1,
   burnin = 100,
   optimizer = config_optimizer("adam", weight_decay = 0.001),
   lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
   early_stopping = 250,
   validation = 0.3,
-  device = "cuda"
+  device = "cuda",
 )
-save(msdm_fit_embedding_untrained, file = "data/r_objects/msdm_fit_embedding_untrained.RData")
+save(msdm_fit_embedding_traits_static, file = "data/r_objects/msdm_fit_embedding_traits_static.RData")
 
 # 2. Evaluate
 # Per species
+load("data/r_objects/msdm_fit_embedding_traits_static.RData")
 data_split = test_data %>%
   group_by(species_int) %>%
   group_split()
 
-msdm_results_embedding_untrained = lapply(data_split, function(data_spec){
+msdm_results_embedding_traits_static = lapply(data_split, function(data_spec){
   target_species = data_spec$species[1]
   data_spec = dplyr::select(data_spec, -species)
   
   msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_untrained, data_spec)
+    evaluate_model(msdm_fit_embedding_traits_static, data_spec)
   }, error = function(e){
     list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
   })
@@ -69,7 +71,7 @@ msdm_results_embedding_untrained = lapply(data_split, function(data_spec){
   performance_summary = tibble(
     species = !!target_species,
     obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_untrained",
+    model = "MSDM_embed_informed_traits_static",
     auc = msdm_performance$AUC,
     accuracy = msdm_performance$Accuracy,
     kappa = msdm_performance$Kappa,
@@ -79,42 +81,44 @@ msdm_results_embedding_untrained = lapply(data_split, function(data_spec){
   )
 }) %>% bind_rows()
 
-save(msdm_results_embedding_untrained, file = "data/r_objects/msdm_results_embedding_untrained.RData")
+save(msdm_results_embedding_traits_static, file = "data/r_objects/msdm_results_embedding_traits_static.RData")
 
 # -------------------------------------------------------------------#
 # With training the embedding ####
 # -------------------------------------------------------------------#
 formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
-msdm_fit_embedding_trained = dnn(
+msdm_fit_embedding_traits_trained = dnn(
   formula,
   data = train_data,
-  hidden = c(200L, 200L, 200L),
+  hidden = c(500L, 500L, 500L),
   loss = "binomial",
   activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 12000L,
-  lr = 0.01,
-  batchsize = nrow(train_data)/5,
+  epochs = 15000L,
+  lr = 0.01,
+  baseloss = 1,
+  batchsize = nrow(train_data),
   dropout = 0.1,
   burnin = 100,
   optimizer = config_optimizer("adam", weight_decay = 0.001),
   lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
   early_stopping = 250,
   validation = 0.3,
-  device = "cuda"
+  device = "cuda",
 )
-save(msdm_fit_embedding_trained, file = "data/r_objects/msdm_fit_embedding_trained.RData")
+save(msdm_fit_embedding_traits_trained, file = "data/r_objects/msdm_fit_embedding_traits_trained.RData")
 
 # 2. Evaluate
+load("data/r_objects/msdm_fit_embedding_traits_trained.RData")
 data_split = test_data %>%
   group_by(species_int) %>%
   group_split()
 
-msdm_results_embedding_trained = lapply(data_split, function(data_spec){
+msdm_results_embedding_traits_trained = lapply(data_split, function(data_spec){
   target_species = data_spec$species[1]
   data_spec = dplyr::select(data_spec, -species)
   
   msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_trained, data_spec)
+    evaluate_model(msdm_fit_embedding_traits_trained, data_spec)
  }, error = function(e){
     list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
   })
@@ -122,7 +126,7 @@ msdm_results_embedding_trained = lapply(data_split, function(data_spec){
   performance_summary = tibble(
     species = !!target_species,
     obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_trained",
+    model = "MSDM_embed_informed_traits_trained",
     auc = msdm_performance$AUC,
     accuracy = msdm_performance$Accuracy,
     kappa = msdm_performance$Kappa,
@@ -132,4 +136,4 @@ msdm_results_embedding_trained = lapply(data_split, function(data_spec){
   )
 }) %>% bind_rows()
 
-save(msdm_results_embedding_trained, file = "data/r_objects/msdm_results_embedding_trained.RData")
+save(msdm_results_embedding_traits_trained, file = "data/r_objects/msdm_results_embedding_traits_trained.RData")
diff --git a/R/05_performance_analysis.qmd b/R/05_performance_analysis.qmd
index ca81a81..83b3216 100644
--- a/R/05_performance_analysis.qmd
+++ b/R/05_performance_analysis.qmd
@@ -15,7 +15,7 @@ library(DT)
 load("../data/r_objects/ssdm_results.RData")
 load("../data/r_objects/msdm_fit.RData")
 load("../data/r_objects/msdm_results.RData")
-load("../data/r_objects/msdm_results_embedding_trained.RData.")
+load("../data/r_objects/msdm_results_embedding_trained.RData")
 load("../data/r_objects/msdm_results_multiclass.RData.")
 load("../data/r_objects/range_maps.RData")
 load("../data/r_objects/range_maps_gridded.RData")
@@ -110,11 +110,11 @@ Binary Neural Network with species embedding (**MSDM_embed**)
 - embedding initialized at random
 - three hidden layers, sigmoid + leaky ReLu activations, binomial loss
 
-Binary Neural Network with trait-informed species embedding (**MSDM_embed_traits**)
+Binary Neural Network with trait-informed species embedding (**MSDM_embed_informed_trained**)
 
 - definition: presence \~ environment + embedding(species)
 - prediction: probability of occurrence given a set of (environmental) inputs and species identity
-- embedding initialized using eigenvectors of functional distance matrix
+- embedding initialized using eigenvectors of functional distance matrix, then further trained on the data
 - three hidden layers, sigmoid + leaky ReLu activations, binomial loss
 
 Multi-Class Neural Network (**MSDM_multiclass**)
@@ -529,7 +529,7 @@ bslib::card(plot, full_screen = T)
 Functional groups were assigned based on taxonomic order. The following groupings were used:
 
 | Functional group      | Taxomic orders                                                        |
-|-----------------------|-----------------------------------------------------------------------|
+|--------------------------|----------------------------------------------|
 | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla                    |
 | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
 | arboreal              | Primates, Pilosa                                                      |
-- 
GitLab
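
The nested `lapply()` added to `R/01_range_map_preparation.R` builds one single-row data frame per ordered species pair and then reshapes the result with `pivot_wider()` and `column_to_rownames()`. The same mean-minimum-distance matrix can be filled in place, which avoids materialising roughly n² one-row data frames. A minimal sketch, assuming the `geom_dist` and `range_maps_split` objects created in that script; the names `ids_by_species` and `range_dist_alt` are illustrative, not from the commit:

```r
library(Rfast)

# Per-species grid-cell ids, named by species (assumes range_maps_split, geom_dist exist)
ids_by_species = lapply(range_maps_split, function(df) df$geom_id)
names(ids_by_species) = sapply(range_maps_split, function(df) df$name_matched[1])

n_species = length(ids_by_species)
range_dist_alt = matrix(NA_real_, n_species, n_species,
                        dimnames = list(names(ids_by_species), names(ids_by_species)))

for (i in seq_len(n_species)) {
  for (j in seq_len(n_species)) {
    d_sub = geom_dist[ids_by_species[[i]], ids_by_species[[j]], drop = FALSE]
    # Mean distance from each cell of species i to the nearest cell of species j
    range_dist_alt[i, j] = mean(Rfast::rowMins(d_sub, value = TRUE))
  }
}
```

The species names stay attached as dimnames throughout, which is exactly what the long-format version reconstructs afterwards with `pivot_wider()`.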
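`R/02_03_phylo_preparation.R` averages the cophenetic matrices of 50 sampled trees with `Reduce("+", dists)`. `cophenetic.phylo()` returns rows and columns in each tree's own tip order, and that order is not guaranteed to be identical across trees of the posterior sample, so it can be safer to align the matrices on one common species order before summing. A hedged sketch, assuming all pruned trees carry the same, unique tip labels after renaming; `species_order` and `dists_aligned` are illustrative names:

```r
library(ape)

# Common species order taken from the first pruned tree (assumes forest_pruned, indices exist)
species_order = sort(forest_pruned[[1]]$tip.label)

dists_aligned = lapply(indices, function(i){
  d = cophenetic.phylo(forest_pruned[[i]])
  d[species_order, species_order]  # reorder rows/columns to the shared species order
})

phylo_dist_aligned = Reduce("+", dists_aligned) / length(dists_aligned)
```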
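The renamed `R/04_03_msdm_embed_traits.R` passes a precomputed `embeddings` matrix to `e(species_int, weights = embeddings, ...)`, and `05_performance_analysis.qmd` describes that embedding as initialized from eigenvectors of a distance matrix, but the construction itself is not part of this commit. One plausible construction uses classical MDS (`cmdscale`) on a distance matrix such as `phylo_dist` or `range_dist` from the scripts above; the dimensionality, scaling, and object names below are illustrative and would have to match the embedding dimension and species coding expected by the model:

```r
# Sketch only: turn a species x species distance matrix into embedding weights.
# Assumes phylo_dist (or range_dist) from the scripts above; row order must match
# the integer coding used for species_int in the model data.
embedding_dim = 20                           # illustrative, not taken from the commit
mds = cmdscale(as.dist(phylo_dist), k = embedding_dim, eig = TRUE)

embeddings = scale(mds$points)               # species x embedding_dim matrix
rownames(embeddings) = rownames(phylo_dist)  # keep species names for bookkeeping
```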