From 2cfd549197dc09a54d7c0e68ac39c11df4f0213c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Wed, 15 Jan 2025 14:57:57 +0100 Subject: [PATCH] some changes to modeling pipelines and quick additions for updated report --- R/01_01_range_map_preparation.R | 4 +- R/03_02_absence_preparation.R | 17 +- R/04_01_ssdm_modeling.R | 67 ++++---- R/04_03_msdm_embed_raw.R | 40 +++-- R/04_07_msdm_embed_multi_nolonlat.R | 20 ++- R/04_07_msdm_oneoff.R | 111 +++++++++++++ R/05_01_performance_analysis.qmd | 189 +++++++++++----------- R/05_01_performance_analysis_carsten.qmd | 190 +++++++++++++++++++++++ R/_publish.yml | 4 + R/utils.R | 2 +- renv.lock | 151 ++++++++++++++++++ 11 files changed, 637 insertions(+), 158 deletions(-) create mode 100644 R/04_07_msdm_oneoff.R create mode 100644 R/05_01_performance_analysis_carsten.qmd diff --git a/R/01_01_range_map_preparation.R b/R/01_01_range_map_preparation.R index 0a07176..eb33fdc 100644 --- a/R/01_01_range_map_preparation.R +++ b/R/01_01_range_map_preparation.R @@ -79,8 +79,8 @@ geometries_unique = range_maps_gridded_id %>% group_by(geom_id) %>% slice_head(n = 1) -geom_dist = sf::st_distance(geometries_unique, geometries_unique) # Takes ~ 10 mins - %>% as.matrix() +geom_dist = sf::st_distance(geometries_unique, geometries_unique) %>% # Takes ~ 10 mins + as.matrix() range_maps_split = range_maps_gridded_id %>% group_by(name_matched) %>% diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 914ddb7..b148c64 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -49,9 +49,9 @@ range_maps = st_transform(range_maps, proj_string) # geographically close (reproduce sampling biases) but environmentally # # dissimilar (avoid false negatives) to the known occurrences # # ---------------------------------------------------------------------------# -future::plan("multisession", workers = 16) +future::plan("multisession", workers = 24) -model_data = furrr::future_walk( +furrr::future_walk( .x = target_species, .options = furrr::furrr_options(seed = 42, scheduling = FALSE), # make sure workers get constant stream of work .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string), @@ -126,13 +126,13 @@ model_data = furrr::future_walk( y = y + runif(nrow(.), -5000, 5000)) %>% # Add jitter (res/2) to cell centroids st_as_sf(coords = c("x", "y"), crs = proj_string) - sample_abs = bind_cols( - terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) - ) %>% - bind_cols(sample_points) %>% + sample_abs = sample_points %>% + bind_cols( + terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) + ) %>% drop_na() - abs_spec_list[[length(abs_spec_list)+1]] = sample_points + abs_spec_list[[length(abs_spec_list)+1]] = sample_abs samples_required = samples_required - nrow(sample_points) # Sometimes there are no env data for sample points, so keep sampling } @@ -154,7 +154,7 @@ model_data = furrr::future_walk( # Create presence-absence dataframe pa_spec = occs_spec %>% dplyr::mutate(present = 1) %>% - bind_rows(abs_spec) + bind_rows(abs_spec) ggplot() + ggtitle(species) + @@ -189,7 +189,6 @@ model_data = furrr::future_walk( }) pa_spec$fold_eval = folds - pa_spec$geometry = NULL save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) } diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R index 5a0de5e..914c1aa 100644 --- a/R/04_01_ssdm_modeling.R +++ b/R/04_01_ssdm_modeling.R @@ -1,27 +1,30 @@ library(furrr) 
library(progressr) library(dplyr) +library(cito) source("R/utils.R") # ----------------------------------------------------------------------# # Define Training function #### # ----------------------------------------------------------------------# -train_models = function(data_split){ - future::plan("multisession", workers = 16) - p = progressor(along = data_split) +train_models = function(pa_files){ + future::plan("multisession", workers = 4) + p = progressor(along = pa_files) - ssdm_results_list = furrr::future_map( - .x = data_split, + furrr::future_walk( + .x = pa_files, .options = furrr::furrr_options( seed = 123, - packages = c("dplyr", "tidyr", "sf", "caret", "cito", "pROC") + packages = c("dplyr", "tidyr", "sf", "caret", "cito", "pROC"), + scheduling = FALSE # make sure workers get constant stream of work ), - .f = function(data_spec){ - spec = data_spec$species[1] - p(sprintf("spec=%s", spec)) + .f = function(pa_file){ + load(pa_file) + species = pa_spec$species[1] + p(sprintf("species=%s", species)) - if(all(is.na(data_spec$fold_eval))){ + if(all(is.na(pa_spec$fold_eval))){ warning("Too few samples") return() } @@ -37,12 +40,12 @@ train_models = function(data_split){ ) # Create factor presence column - data_spec$present_fct = factor(data_spec$present, levels = c("0", "1"), labels = c("A", "P")) + pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P")) # Outer CV loop (for averaging performance metrics) - performance_cv = lapply(sort(unique(data_spec$fold_eval)), function(k){ - data_test = dplyr::filter(data_spec, fold_eval == k) - data_train = dplyr::filter(data_spec, fold_eval != k) + performance_cv = lapply(sort(unique(pa_spec$fold_eval)), function(k){ + data_test = dplyr::filter(pa_spec, fold_eval == k) + data_train = dplyr::filter(pa_spec, fold_eval != k) if(nrow(data_test) == 0 || nrow(data_train) == 0){ warning("Too few samples") @@ -75,7 +78,7 @@ train_models = function(data_split){ ) # Define predictors - predictors = paste0("layer_", 1:19) + predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") # Random Forest ##### rf_performance = tryCatch({ @@ -84,7 +87,6 @@ train_models = function(data_split){ x = data_train[, predictors], y = data_train$present_fct, method = "rf", - metric = "AUC", trControl = train_ctrl, tuneLength = 4, verbose = F @@ -101,7 +103,6 @@ train_models = function(data_split){ x = data_train[, predictors], y = data_train$present_fct, method = "gbm", - metric = "AUC", trControl = train_ctrl, tuneLength = 4, verbose = F @@ -117,7 +118,6 @@ train_models = function(data_split){ x = data_train[, predictors], y = data_train$present_fct, method = "gamSpline", - metric = "AUC", tuneLength = 4, trControl = train_ctrl ) @@ -136,7 +136,7 @@ train_models = function(data_split){ hidden = c(100L, 100L, 100L), loss = "binomial", activation = c("leaky_relu", "leaky_relu", "leaky_relu"), - epochs = 400L, + epochs = 500L, burnin = 100L, lr = 0.001, batchsize = max(nrow(data_train)/10, 64), @@ -150,14 +150,18 @@ train_models = function(data_split){ plot = F ) - evaluate_model(nn_fit, data_test) + if(nn_fit$successfull == 1){ + evaluate_model(nn_fit, data_test) + } else { + na_performance + } }, error = function(e){ na_performance }) # Summarize results performance_summary = tibble( - species = spec, + species = !!species, obs = nrow(data_train), fold_eval = k, model = c("RF", "GBM", "GAM", "NN"), @@ -175,8 +179,7 @@ train_models = function(data_split){ # Combine and save evaluation results performance_spec = 
bind_rows(performance_cv) - filename = paste0(gsub(" ", "_", spec), "_results.RData") - save(performance_spec, file = file.path("data/r_objects/nested_cv/model_results/", filename)) + save(performance_spec, file = paste0("data/r_objects/model_results/", species, ".RData")) } ) } @@ -184,22 +187,22 @@ train_models = function(data_split){ # ----------------------------------------------------------------------# # Run training #### # ----------------------------------------------------------------------# -load("data/r_objects/nested_cv/model_data.RData") - handlers(global = TRUE) handlers("progress") -data_split = group_split(model_data, species) +pa_files = list.files("data/r_objects/pa_sampling/", full.names = T) +species_processed = stringr::str_remove_all(list.files("data/r_objects/model_results/"), ".RData") +pa_files_run = pa_files[!sapply(pa_files, function(x){any(stringr::str_detect(x, species_processed))})] -train_models(data_split) +train_models(pa_files_run) # ----------------------------------------------------------------------# # Combine results #### # ----------------------------------------------------------------------# -files = list.files("data/r_objects/nested_cv/model_results/", full.names = T) +files = list.files("data/r_objects/model_results/", full.names = T) ssdm_results = lapply(files, function(f){load(f); return(performance_spec)}) %>% - bind_rows() %>% - dplyr::group_by(species, model, metric) %>% - dplyr::summarize(value = mean(value, na.rm =T)) + bind_rows() + #dplyr::group_by(species, model, metric) %>% + #dplyr::summarize(value = mean(value, na.rm =T)) -save(ssdm_results, file = "data/r_objects/nested_cv/ssdm_results.RData") +save(ssdm_results, file = "data/r_objects/ssdm_results.RData") diff --git a/R/04_03_msdm_embed_raw.R b/R/04_03_msdm_embed_raw.R index 03a8737..6b2df02 100644 --- a/R/04_03_msdm_embed_raw.R +++ b/R/04_03_msdm_embed_raw.R @@ -4,36 +4,40 @@ library(cito) source("R/utils.R") -load("data/r_objects/model_data.RData") +load("data/r_objects/model_data_pa_sampling.RData") -model_data$species_int = as.integer(as.factor(model_data$species)) - -train_data = dplyr::filter(model_data, train == 1) -test_data = dplyr::filter(model_data, train == 0) +model_data = model_data %>% + dplyr::mutate(species_int = as.integer(as.factor(model_data$species))) %>% + sf::st_drop_geometry() + +test_data = dplyr::filter(model_data, fold_eval == 1) %>% + dplyr::select(-fold_eval) +train_data = dplyr::filter(model_data, fold_eval != 1) %>% + dplyr::select(-fold_eval) # ----------------------------------------------------------------------# # Train model #### # ----------------------------------------------------------------------# -predictors = paste0("layer_", 1:19) -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 50, lambda = 0.000001)")) +predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") +formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 50, train = T, lambda = 0.0001)")) -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there +plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there msdm_fit_embedding_raw = dnn( formula, data = train_data, hidden = c(500L, 500L, 500L), loss = "binomial", activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, + epochs = 10L, lr = 0.01, 
baseloss = 1, batchsize = nrow(train_data), dropout = 0.1, - burnin = 100, + burnin = 500, optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 150, factor = 0.7), + early_stopping = 400, + validation = 0.2, device = "cuda" ) @@ -44,20 +48,22 @@ save(msdm_fit_embedding_raw, file = "data/r_objects/msdm_fit_embedding_raw.RData # ----------------------------------------------------------------------# load("data/r_objects/msdm_fit_embedding_raw.RData") -data_split = split(model_data, model_data$species) +data_split = test_data %>% + split(test_data$species) msdm_results_embedding_raw = lapply(data_split, function(data_spec){ - test_data = dplyr::filter(data_spec, train == 0) %>% + species = data_spec$species[1] + data_spec = data_spec %>% dplyr::select(-species) msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_raw, test_data) + evaluate_model(msdm_fit_embedding_raw, data_spec) }, error = function(e){ list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) }) performance_summary = tibble( - species = data_spec$species[1], + species = !!species, obs = nrow(data_spec), model = "MSDM_embed", auc = msdm_performance$AUC, diff --git a/R/04_07_msdm_embed_multi_nolonlat.R b/R/04_07_msdm_embed_multi_nolonlat.R index 2044e03..15d37e9 100644 --- a/R/04_07_msdm_embed_multi_nolonlat.R +++ b/R/04_07_msdm_embed_multi_nolonlat.R @@ -1,10 +1,11 @@ library(dplyr) library(tidyr) library(cito) +library(sf) source("R/utils.R") -load("data/r_objects/model_data.RData") +load("data/r_objects/model_data_pa_sampling.RData") load("data/r_objects/func_dist.RData") load("data/r_objects/phylo_dist.RData") load("data/r_objects/range_dist.RData") @@ -18,11 +19,13 @@ model_species = Reduce( ) model_data_final = model_data %>% + sf::st_drop_geometry() %>% dplyr::filter(species %in% !!model_species) %>% dplyr::mutate(species_int = as.integer(as.factor(species))) + -train_data = dplyr::filter(model_data_final, train == 1) -test_data = dplyr::filter(model_data_final, train == 0) +test_data = dplyr::filter(model_data_final, fold_eval == 1) +train_data = dplyr::filter(model_data_final, fold_eval != 1) # Create embeddings func_ind = match(model_species, colnames(func_dist)) @@ -41,14 +44,15 @@ range_embeddings = eigen(range_dist)$vectors[,1:20] # ----------------------------------------------------------------------# # Train model #### # ----------------------------------------------------------------------# -predictors = paste0("layer_", 1:19) +# Define predictors +predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") formula = as.formula( paste0("present ~ ", paste(predictors, collapse = '+'), - " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = F)", - " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = F)", - " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = F)" + " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = T)", + " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = T)", + " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = T)" ) ) @@ -56,7 +60,7 @@ plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty pl msdm_fit_embedding_multi_nolonlat = dnn( formula, data = train_data, - hidden = c(500L, 500L, 
500L), + hidden = c(200L, 200L, 200L), loss = "binomial", activation = c("sigmoid", "leaky_relu", "leaky_relu"), epochs = 30000L, diff --git a/R/04_07_msdm_oneoff.R b/R/04_07_msdm_oneoff.R new file mode 100644 index 0000000..4ef2a03 --- /dev/null +++ b/R/04_07_msdm_oneoff.R @@ -0,0 +1,111 @@ +library(dplyr) +library(tidyr) +library(cito) +library(sf) + +source("R/utils.R") + +load("data/r_objects/model_data_pa_sampling.RData") +load("data/r_objects/func_dist.RData") +load("data/r_objects/phylo_dist.RData") +load("data/r_objects/range_dist.RData") + +# ----------------------------------------------------------------------# +# Prepare data #### +# ----------------------------------------------------------------------# +model_species = Reduce( + intersect, + list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist)) +) + +model_data_final = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(species %in% !!model_species) %>% + dplyr::mutate(species_int = as.integer(as.factor(species))) + + +test_data = dplyr::filter(model_data_final, fold_eval == 1) +train_data = dplyr::filter(model_data_final, fold_eval != 1) + +# Create embeddings +func_ind = match(model_species, colnames(func_dist)) +func_dist = func_dist[func_ind, func_ind] +func_embeddings = eigen(func_dist)$vectors[,1:20] + +phylo_ind = match(model_species, colnames(phylo_dist)) +phylo_dist = phylo_dist[phylo_ind, phylo_ind] +phylo_embeddings = eigen(phylo_dist)$vectors[,1:20] + +range_ind = match(model_species, colnames(range_dist)) +range_dist = range_dist[range_ind, range_ind] +range_embeddings = eigen(range_dist)$vectors[,1:20] + + +# ----------------------------------------------------------------------# +# Train model #### +# ----------------------------------------------------------------------# +# Define predictors +predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") + +formula = as.formula( + paste0("present ~ ", + paste(predictors, collapse = '+'), + " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = T)", + " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = T)", + " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = T)" + ) +) + +plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there +msdm_fit_embedding_multi_nolonlat = dnn( + formula, + data = train_data, + hidden = c(400L, 400L, 400L), + loss = "binomial", + activation = c("sigmoid", "leaky_relu", "leaky_relu"), + epochs = 30000L, + lr = 0.01, + baseloss = 1, + batchsize = nrow(train_data), + dropout = 0.1, + burnin = 100, + optimizer = config_optimizer("adam", weight_decay = 0.001), + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 150, factor = 0.7), + early_stopping = 250, + validation = 0.3, + device = "cuda", +) +save(msdm_fit_embedding_multi_nolonlat, file = "data/r_objects/msdm_fit_embedding_multi_nolonlat.RData") + +# ----------------------------------------------------------------------# +# Evaluate results #### +# ----------------------------------------------------------------------# +load("data/r_objects/msdm_fit_embedding_multi_nolonlat.RData") +data_split = test_data %>% + group_by(species_int) %>% + group_split() + +msdm_results_embedding_multi_nolonlat = lapply(data_split, function(data_spec){ + target_species = data_spec$species[1] + data_spec = dplyr::select(data_spec, -species) + + msdm_performance = tryCatch({ + 
evaluate_model(msdm_fit_embedding_multi_nolonlat, data_spec) + }, error = function(e){ + list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) + }) + + performance_summary = tibble( + species = !!target_species, + obs = length(which(model_data$species == target_species)), + model = "MSDM_embed_informed_multi_nolonlat", + auc = msdm_performance$AUC, + accuracy = msdm_performance$Accuracy, + kappa = msdm_performance$Kappa, + precision = msdm_performance$Precision, + recall = msdm_performance$Recall, + f1 = msdm_performance$F1 + ) +}) %>% bind_rows() + +save(msdm_results_embedding_multi_nolonlat, file = "data/r_objects/msdm_results_embedding_multi_nolonlat.RData") diff --git a/R/05_01_performance_analysis.qmd b/R/05_01_performance_analysis.qmd index 44b9b8e..17be731 100644 --- a/R/05_01_performance_analysis.qmd +++ b/R/05_01_performance_analysis.qmd @@ -11,12 +11,10 @@ library(sf) library(plotly) library(DT) -load("../data/r_objects/nested_cv/model_data.RData") -load("../data/r_objects/nested_cv/ssdm_results.RData") -#load("../data/r_objects/msdm_fit.RData") -#load("../data/r_objects/msdm_results.RData") -#load("../data/r_objects/msdm_results_embedding_trained.RData") -#load("../data/r_objects/msdm_results_multiclass.RData.") +load("../data/r_objects/model_data_pa_sampling.RData") +load("../data/r_objects/ssdm_results.RData") +load("../data/r_objects/msdm_results_embedding_raw.RData") + load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData") load("../data/r_objects/occs_final.RData") @@ -25,6 +23,8 @@ sf::sf_use_s2(use_s2 = FALSE) ``` ```{r globals, echo = FALSE, include = FALSE} + + # Count occs per species obs_count = model_data %>% sf::st_drop_geometry() %>% @@ -52,8 +52,19 @@ lin_regression = function(x, y, family = "binomial"){ ) } +msdm_results = msdm_results_embedding_raw %>% + pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") %>% + dplyr::select(-obs) %>% + dplyr::mutate( + fold_eval = 1 + ) %>% + drop_na() + # Performance table -performance = bind_rows(ssdm_results) %>% +performance = ssdm_results %>% + dplyr::select(-obs) %>% + dplyr::filter(fold_eval == 1, species %in% msdm_results$species) %>% # Only look at first fold + bind_rows(msdm_results) %>% ungroup() %>% dplyr::mutate( value = case_when( @@ -66,93 +77,94 @@ performance = bind_rows(ssdm_results) %>% focal_metrics = unique(performance$metric) ``` -## *Summary* +## Summary -*This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.* +This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest. 
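For reference, the sketch below shows one way the reported metrics (AUC, accuracy, Cohen's kappa, precision, recall, F1) can be derived from predicted occurrence probabilities. It is an illustrative assumption, not the project's `evaluate_model()` from `R/utils.R`, which may differ in detail.

```r
library(caret)
library(pROC)

# Illustrative only: derive the reported metrics from predicted probabilities
# and observed 0/1 presences (the repository's evaluate_model() may differ).
evaluate_sketch = function(probs, observed) {
  preds   = factor(round(probs), levels = c("0", "1"), labels = c("A", "P"))
  obs_fct = factor(observed, levels = c("0", "1"), labels = c("A", "P"))
  cm = caret::confusionMatrix(preds, obs_fct, positive = "P")
  list(
    AUC       = as.numeric(pROC::auc(pROC::roc(observed, probs, quiet = TRUE))),
    Accuracy  = unname(cm$overall["Accuracy"]),
    Kappa     = unname(cm$overall["Kappa"]),
    Precision = unname(cm$byClass["Precision"]),
    Recall    = unname(cm$byClass["Recall"]),
    F1        = unname(cm$byClass["F1"])
  )
}

# Example with simulated data:
set.seed(1)
obs   = rbinom(200, 1, 0.5)
probs = plogis(rnorm(200, mean = ifelse(obs == 1, 1, -1)))
evaluate_sketch(probs, obs)
```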
-*Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).* +Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). -### *Modeling overview:* +### Modeling overview: -#### *General decisions* +#### General decisions -- *Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25)* -- *Balanced presences and absences for each species* -- *Predictors: all 19 CHELSA bioclim variables* -- *70/30 Split of training vs. test data (except for NN models)* +- Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25) +- Balanced presences and absences for each species +- Predictors: all 19 CHELSA bioclim variables +- 70/30 Split of training vs. test data (except for NN models) -#### *sSDM Algorithms* +#### sSDM Algorithms -*Random Forest (**SSDM_RF**)* +Random Forest (**SSDM_RF**) -- *Hyperparameter tuning of `mtry`* -- *Spatial block cross-validation during training* +- Hyperparameter tuning of `mtry` +- Spatial block cross-validation during training -*Generalized boosted machine (**SSDM_GBM**)* +Generalized boosted machine (**SSDM_GBM**) -- *Hyperparameter tuning across `n.trees` , `interaction.depth` , `shrinkage`, `n.minobsinnode`* -- *Spatial block cross-validation during training* +- Hyperparameter tuning across `n.trees` , `interaction.depth` , `shrinkage`, `n.minobsinnode` +- Spatial block cross-validation during training -*Generalized Linear Model (**SSDM_GLM**)* +Generalized Linear Model (**SSDM_GLM**) -- *Logistic model with binomial link function* -- *Spatial block cross-validation during training* +- Logistic model with binomial link function +- Spatial block cross-validation during training -*Neural Netwok (**SSDM_NN**)* +Neural Netwok (**SSDM_NN**) -- *Three hidden layers, leaky ReLu activations, binomial loss* -- *no spatial block cross-validation during training* +- Three hidden layers, leaky ReLu activations, binomial loss +- no spatial block cross-validation during training -#### *mSDM Algorithms* +#### mSDM Algorithms -*Binary Neural Network with species embedding (**MSDM_embed**)* +Binary Neural Network with species embedding (**MSDM_embed**) -- *definition: presence \~ environment + embedding(species)* -- *prediction: probability of occurrence given a set of (environmental) inputs and species identity* -- *embedding initialized at random* -- *three hidden layers, sigmoid + leaky ReLu activations, binomial loss* +- definition: presence \~ environment + embedding(species) +- prediction: probability of occurrence given a set of (environmental) inputs and species identity +- embedding initialized at random +- three hidden layers, sigmoid + leaky ReLu activations, binomial loss -*Binary Neural Network with trait-informed species embedding (**MSDM_embed_informed_trained**)* +Binary Neural Network with trait-informed species embedding (**MSDM_embed_informed_trained**) -- *definition: presence \~ environment + embedding(species)* -- *prediction: probability of occurrence given a set of (environmental) inputs and species identity* -- *embedding initialized using eigenvectors of functional distance matrix, then further training on data* -- *three hidden layers, sigmoid + leaky ReLu activations, binomial loss* +- definition: presence \~ environment + embedding(species) +- prediction: probability of occurrence given a set of (environmental) inputs and species identity +- embedding initialized using eigenvectors of functional distance matrix, then further training on data +- three 
hidden layers, sigmoid + leaky ReLu activations, binomial loss -*Multi-Class Neural Network (**MSDM_multiclass**)* +Multi-Class Neural Network (**MSDM_multiclass**) -- *definition: species identity \~ environment* -- *prediction: probability distribution across all observed species given a set of (environmental) inputs* -- *presence-only data in training* -- *three hidden layers, leaky ReLu activations, softmax loss* -- *Top-k based evaluation (k=10, P/A \~ target species in / not among top 10 predictions)* +- definition: species identity \~ environment +- prediction: probability distribution across all observed species given a set of (environmental) inputs +- presence-only data in training +- three hidden layers, leaky ReLu activations, softmax loss +- Top-k based evaluation (k=10, P/A \~ target species in / not among top 10 predictions) -### *Key findings:* +### Key findings: -- *sSDM algorithms (RF, GBM) outperformed mSDMs in most cases* -- *mSDMs showed indications of better performance for rare species (\< 10-20 occurrences)* -- *More occurrence records and larger range sizes tended to improve model performance* -- *Higher range coverage correlated with better performance* -- *Range coverage bias and functional group showed some impact but were less consistent* -- *Convergence problems hampered NN sSDM performance* +- sSDM algorithms (RF, GBM) outperformed mSDMs in most cases +- mSDMs showed indications of better performance for rare species (\< 10-20 occurrences) +- More occurrence records and larger range sizes tended to improve model performance +- Higher range coverage correlated with better performance +- Range coverage bias and functional group showed some impact but were less consistent +- Convergence problems hampered NN sSDM performance -## *Analysis* +## Analysis -*The table below shows the analysed modeling results.* +The table below shows the analysed modeling results. 
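Before the results table, a concrete illustration of the species-embedding term described under "mSDM Algorithms" above. This minimal sketch restates the formula constructed in `R/04_03_msdm_embed_raw.R`; the `dnn()` call is abbreviated here, so see that script for the settings actually used.

```r
library(cito)

# One shared network for all species: presence ~ environment + embedding(species).
# species_int is an integer species ID that indexes a trainable 50-dimensional embedding.
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
form = as.formula(
  paste0("present ~ ", paste(predictors, collapse = " + "),
         " + e(species_int, dim = 50, train = TRUE, lambda = 0.0001)")
)
# fit = dnn(form, data = train_data, loss = "binomial", ...)  # full call in R/04_03_msdm_embed_raw.R
```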
```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} DT::datatable(performance) %>% formatRound(columns="value", digits=3) ``` -### *Number of records* +### Number of records -- *Model performance was generally better for species with more observations* -- *Very poor performance below 50-100 observations* +- Model performance was generally better for species with more observations +- Very poor performance below 50-100 observations ```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} plot_performance_over_frequency = function(df_plot, metric) { - df_plot = dplyr::filter(df_plot, metric == !!metric) + + df_plot = dplyr::filter(df_plot, metric == !!metric) # Calculate regression lines for each model and metric combination suppressWarnings({ @@ -208,46 +220,47 @@ plot_performance_over_frequency = function(df_plot, metric) { return(plot) } -df_plot = performance %>% dplyr::left_join(obs_count, by = "species") +df_plot = performance %>% dplyr::left_join(obs_count, by = "species") + ``` ::: panel-tabset -#### *AUC* +#### AUC ```{r echo = FALSE} plot = plot_performance_over_frequency(df_plot, metric = "auc") bslib::card(plot, full_screen = T) ``` -#### *F1* +#### F1 ```{r echo = FALSE} plot = plot_performance_over_frequency(df_plot, metric = "f1") bslib::card(plot, full_screen = T) ``` -#### *Cohen's kappa* +#### Cohen's kappa ```{r echo = FALSE} plot = plot_performance_over_frequency(df_plot, metric = "kappa") bslib::card(plot, full_screen = T) ``` -#### *Accurracy* +#### Accurracy ```{r echo = FALSE} plot = plot_performance_over_frequency(df_plot, metric = "accuracy") bslib::card(plot, full_screen = T) ``` -#### *Precision* +#### Precision ```{r echo = FALSE} plot = plot_performance_over_frequency(df_plot, metric = "precision") bslib::card(plot, full_screen = T) ``` -#### *Recall* +#### Recall ```{r echo = FALSE} plot = plot_performance_over_frequency(df_plot, metric = "recall") @@ -255,16 +268,14 @@ bslib::card(plot, full_screen = T) ``` ::: +### Range characteristics +#### Range size -### *Range characteristics* - -#### *Range size* - -*Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).* +Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). -- *Model performance tended to be slightly higher for species with larger range size* -- *Only RF shows continuous performance improvements beyond range sizes of \~5M km²* +- Model performance tended to be slightly higher for species with larger range size +- Only RF shows continuous performance improvements beyond range sizes of \~5M km² ```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} @@ -349,15 +360,15 @@ for (model_name in unique(df_plot$model)) { bslib::card(plot, full_screen = T) ``` -#### *Range coverage* +#### Range coverage -*Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.* +Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. 
$$ RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} $$ -- *Models for species with higher range coverage showed slightly better performance* +- Models for species with higher range coverage showed slightly better performance ```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} df_cells_total = range_maps_gridded %>% @@ -455,17 +466,17 @@ for (model_name in unique(df_plot$model)) { bslib::card(plot, full_screen = T) ``` -#### *Range coverage bias* +#### Range coverage bias -*Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.* +Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. $$ RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)} $$ -*Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.* +Higher bias values indicate that occurrence records are spatially more clustered within the range of the species. -- *There was no strong relationship between range coverage bias and model performance* +- There was no strong relationship between range coverage bias and model performance ```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} df_occs_total = occs_final %>% @@ -556,18 +567,18 @@ for (model_name in unique(df_plot$model)) { bslib::card(plot, full_screen = T) ``` -### *Functional group* +### Functional group -*Functional groups were assigned based on taxonomic order. The following groupings were used:* +Functional groups were assigned based on taxonomic order. The following groupings were used: -| *Functional group* | *Taxomic orders* | -|-------------------|-----------------------------------------------------| -| *large ground-dwelling* | *Carnivora, Artiodactyla, Cingulata, Perissodactyla* | -| *small ground-dwelling* | *Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha* | -| *arboreal* | *Primates, Pilosa* | -| *flying* | *Chiroptera* | +| Functional group | Taxomic orders | +|--------------------|----------------------------------------------------| +| large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla | +| small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha | +| arboreal | Primates, Pilosa | +| flying | Chiroptera | -- *Models for bats tended to perform slightly worse than for other groups.* +- Models for bats tended to perform slightly worse than for other groups. 
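To make the grouping explicit, the sketch below maps taxonomic orders to the four functional groups listed in the table above. The analysis itself loads a precomputed assignment from `data/r_objects/functional_groups.RData`, so the helper name here is purely illustrative.

```r
library(dplyr)

# Illustrative mapping from taxonomic order to functional group
# (the analysis uses the precomputed functional_groups object instead).
assign_functional_group = function(order) {
  dplyr::case_when(
    order %in% c("Carnivora", "Artiodactyla", "Cingulata", "Perissodactyla") ~ "large ground-dwelling",
    order %in% c("Rodentia", "Didelphimorphia", "Soricomorpha",
                 "Paucituberculata", "Lagomorpha") ~ "small ground-dwelling",
    order %in% c("Primates", "Pilosa") ~ "arboreal",
    order == "Chiroptera" ~ "flying",
    .default = NA_character_
  )
}

assign_functional_group(c("Chiroptera", "Primates", "Rodentia"))
#> [1] "flying"  "arboreal"  "small ground-dwelling"
```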
```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} df_plot = performance %>% diff --git a/R/05_01_performance_analysis_carsten.qmd b/R/05_01_performance_analysis_carsten.qmd new file mode 100644 index 0000000..7dee2c4 --- /dev/null +++ b/R/05_01_performance_analysis_carsten.qmd @@ -0,0 +1,190 @@ +--- +title: "SDM Performance analysis" +format: html +editor: visual +engine: knitr +--- + +```{r init, echo = FALSE, include = FALSE} +library(tidyverse) +library(sf) +library(plotly) +library(DT) + +load("../data/r_objects/model_data_pa_sampling.RData") +load("../data/r_objects/ssdm_results.RData") +load("../data/r_objects/msdm_results_embedding_raw.RData") + +load("../data/r_objects/range_maps.RData") +load("../data/r_objects/range_maps_gridded.RData") +load("../data/r_objects/occs_final.RData") +load("../data/r_objects/functional_groups.RData") +sf::sf_use_s2(use_s2 = FALSE) +``` + +```{r globals, echo = FALSE, include = FALSE} + + +# Count occs per species +obs_count = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(present == 1) %>% + dplyr::group_by(species) %>% + dplyr::summarise(obs = n()) + + +# Regression functions +asym_regression = function(x, y){ + nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) + new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) + data.frame( + x = new_x, + fit = predict(nls_fit, newdata = data.frame(x = new_x)) + ) +} + +lin_regression = function(x, y, family = "binomial"){ + glm_fit = suppressWarnings(glm(y~x, family = family)) + new_x = seq(min(x), max(x), length.out = 100) + data.frame( + x = new_x, + fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") + ) +} + +msdm_results = msdm_results_embedding_raw %>% + pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") %>% + dplyr::select(-obs) %>% + dplyr::mutate( + fold_eval = 1 + ) %>% + drop_na() + +# Performance table +performance = ssdm_results %>% + dplyr::select(-obs) %>% + dplyr::filter(fold_eval == 1, species %in% msdm_results$species) %>% # Only look at first fold + bind_rows(msdm_results) %>% + ungroup() %>% + dplyr::mutate( + value = case_when( + ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accurracy", "precision", "recall")) ~ 0.5, + ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, + .default = value + ) + ) + +focal_metrics = unique(performance$metric) +``` + + +## Analysis + +### Number of records + +```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} +plot_performance_over_frequency = function(df_plot, metric) { + + df_plot = dplyr::filter(df_plot, metric == !!metric) + + # Calculate regression lines for each model and metric combination + suppressWarnings({ + regression_lines = df_plot %>% + group_by(model) %>% + group_modify( ~ asym_regression(.x$obs, .x$value)) + }) + + # Create base plot + plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Number of observations", + xaxis = list(title = "Number of observations", type = "log"), + yaxis = list(title = metric), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest' + ) + + # Points + for (model_name in unique(df_plot$model)) { + plot = plot %>% + add_markers( + data = filter(df_plot, model == model_name, metric %in% focal_metrics), + x = ~ obs, + y = ~ value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~ model, + hoverinfo = 'text', + text = ~ paste( + "Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3) + ) + ) + } + + # Add regression lines + for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~ x, + y = ~ fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE + ) + } + + return(plot) +} + +df_plot = performance %>% dplyr::left_join(obs_count, by = "species") + +``` + +::: panel-tabset +#### AUC + +```{r echo = FALSE} +plot = plot_performance_over_frequency(df_plot, metric = "auc") +bslib::card(plot, full_screen = T) +``` + +#### F1 + +```{r echo = FALSE} +plot = plot_performance_over_frequency(df_plot, metric = "f1") +bslib::card(plot, full_screen = T) +``` + +#### Cohen's kappa + +```{r echo = FALSE} +plot = plot_performance_over_frequency(df_plot, metric = "kappa") +bslib::card(plot, full_screen = T) +``` + +#### Accurracy + +```{r echo = FALSE} +plot = plot_performance_over_frequency(df_plot, metric = "accuracy") +bslib::card(plot, full_screen = T) +``` + +#### Precision + +```{r echo = FALSE} +plot = plot_performance_over_frequency(df_plot, metric = "precision") +bslib::card(plot, full_screen = T) +``` + +#### Recall + +```{r echo = FALSE} +plot = plot_performance_over_frequency(df_plot, metric = "recall") +bslib::card(plot, full_screen = T) +``` +::: diff --git a/R/_publish.yml b/R/_publish.yml index 02920d1..c07bacd 100644 --- a/R/_publish.yml +++ b/R/_publish.yml @@ -2,3 +2,7 @@ quarto-pub: - id: 309c45c3-1515-4985-a435-9ffa1888c5e7 url: 'https://chrkoenig.quarto.pub/ssdm-performance-analysis-0a06' +- source: 05_01_performance_analysis_carsten.qmd + quarto-pub: + - id: f9cafa63-bfd4-4c00-b401-09ec137bd3ce + url: 'https://chrkoenig.quarto.pub/sdm-performance-carsten' diff --git a/R/utils.R b/R/utils.R index a7436a5..1c9dc02 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,7 +40,7 @@ evaluate_model <- function(model, data) { # Predict probabilities if(class(model) %in% c("citodnn", "citodnnBootstrap")){ - probs <- predict(model, data, type = "response")[,1] + probs <- predict(model, as.matrix(data), type = "response")[,1] preds <- factor(round(probs), levels = c("0", "1"), labels = c("A", "P")) } else { probs <- predict(model, data, type = "prob")$P diff --git a/renv.lock b/renv.lock index ff0fc5a..7f13630 100644 --- a/renv.lock +++ b/renv.lock @@ -2,6 +2,26 @@ "R": { "Version": "4.3.0", "Repositories": [ + { + "Name": "BioCsoft", + "URL": "https://bioconductor.org/packages/3.18/bioc" + }, + { + "Name": "BioCann", + "URL": "https://bioconductor.org/packages/3.18/data/annotation" + }, + { + "Name": "BioCexp", + "URL": "https://bioconductor.org/packages/3.18/data/experiment" + }, + { + "Name": "BioCworkflows", + "URL": "https://bioconductor.org/packages/3.18/workflows" + }, + { 
+ "Name": "BioCbooks", + "URL": "https://bioconductor.org/packages/3.18/books" + }, { "Name": "CRAN", "URL": "https://cloud.r-project.org" @@ -177,6 +197,20 @@ ], "Hash": "f27411eb6d9c3dada5edd444b8416675" }, + "RcppArmadillo": { + "Package": "RcppArmadillo", + "Version": "14.2.0-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "methods", + "stats", + "utils" + ], + "Hash": "ce9c35ab9ea9c6ef4e27a08868cdb32b" + }, "RcppEigen": { "Package": "RcppEigen", "Version": "0.3.4.0.2", @@ -190,6 +224,57 @@ ], "Hash": "4ac8e423216b8b70cb9653d1b3f71eb9" }, + "RcppGSL": { + "Package": "RcppGSL", + "Version": "0.3.13", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Rcpp", + "stats" + ], + "Hash": "e8fc7310d256a7b6c4de8e57ab76c55d" + }, + "RcppParallel": { + "Package": "RcppParallel", + "Version": "5.1.9", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "f38a72a419b91faac0ce5d9eee04c120" + }, + "RcppZiggurat": { + "Package": "RcppZiggurat", + "Version": "0.1.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "RcppGSL", + "graphics", + "parallel", + "stats", + "utils" + ], + "Hash": "75b4a36aeeed440ad03b996081190703" + }, + "Rfast": { + "Package": "Rfast", + "Version": "2.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "RcppArmadillo", + "RcppParallel", + "RcppZiggurat" + ], + "Hash": "79e8394e1fa06a4ae954b70ca9b16e8f" + }, "SQUAREM": { "Package": "SQUAREM", "Version": "2021.1", @@ -832,6 +917,21 @@ ], "Hash": "33698c4b3127fc9f506654607fb73676" }, + "dismo": { + "Package": "dismo", + "Version": "1.3-16", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "methods", + "raster", + "sp", + "terra" + ], + "Hash": "5c8646b40ba69146afc070aaea9f893c" + }, "doParallel": { "Package": "doParallel", "Version": "1.0.17", @@ -1139,6 +1239,17 @@ ], "Hash": "15e9634c0fcd294799e9b2e929ed1b86" }, + "geos": { + "Package": "geos", + "Version": "0.2.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "libgeos", + "wk" + ], + "Hash": "117a3a09b793abf1d2146027a71b7524" + }, "geosphere": { "Package": "geosphere", "Version": "1.5-18", @@ -1623,6 +1734,13 @@ ], "Hash": "d908914ae53b04d4c0c0fd72ecc35370" }, + "libgeos": { + "Package": "libgeos", + "Version": "3.11.1-2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "323d0f39c2e5ebcb152b810d1e8ed9bb" + }, "lifecycle": { "Package": "lifecycle", "Version": "1.0.4", @@ -2660,6 +2778,18 @@ ], "Hash": "75940133cca2e339afce15a586f85b11" }, + "spatialEco": { + "Package": "spatialEco", + "Version": "2.0-2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "sf", + "terra" + ], + "Hash": "d0352030add66f0c73ff5d7473b7aef1" + }, "stringdist": { "Package": "stringdist", "Version": "0.9.12", @@ -2847,6 +2977,27 @@ ], "Hash": "829f27b9c4919c16b593794a6344d6c0" }, + "tidyterra": { + "Package": "tidyterra", + "Version": "0.6.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "data.table", + "dplyr", + "ggplot2", + "magrittr", + "rlang", + "scales", + "sf", + "terra", + "tibble", + "tidyr" + ], + "Hash": "5d9672bc78297d33c4ad8b51a8aaa48f" + }, "tidyverse": { "Package": "tidyverse", "Version": "2.0.0", -- GitLab