From c1fa2153d747110118b67cc2825de6d8bb7648fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Wed, 19 Mar 2025 09:37:54 +0100 Subject: [PATCH] Implement glock block CV strategy --- R/03_02_absence_preparation.R | 55 +- R/03_03_model_data_finalization.R | 79 ++ R/04_01_modelling_ssdm.R | 106 ++- R/04_02_modelling_msdm_embed.R | 64 +- R/04_03_modelling_msdm_onehot.R | 64 +- ...ng_msdm_rf.R => 04_04_modelling_msdm_rf.R} | 94 +-- R/04_06_rf_testing.R | 144 ---- R/05_01_performance_report.qmd | 16 +- R/05_01_performance_report_rf.qmd | 692 ++++++++++++++++++ R/05_02_publication_analysis.R | 61 +- R/_publish.yml | 6 + 11 files changed, 979 insertions(+), 402 deletions(-) create mode 100644 R/03_03_model_data_finalization.R rename R/{04_05_modelling_msdm_rf.R => 04_04_modelling_msdm_rf.R} (56%) delete mode 100644 R/04_06_rf_testing.R create mode 100644 R/05_01_performance_report_rf.qmd diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index d7ef767..1b9527b 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -1,20 +1,13 @@ # General packages library(dplyr) library(tidyr) -library(furrr) +library(ggplot2) # Geo packages library(terra) -library(CoordinateCleaner) library(sf) library(geos) - -# Stat packages -library(MASS) -library(dismo) -library(spatialEco) library(blockCV) -library(caret) source("R/utils.R") @@ -137,22 +130,9 @@ for(species in setdiff(target_species, species_processed)){ ) %>% bind_rows(abs_spec) - ref = st_bbox(sampling_region)[c(1,3,2,4)] - ggplot() + - ggtitle(species) + - geom_sf(data = sa_polygon) + - geom_sf(data = range_spec, alpha = 0, color = "red") + - geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) + - scale_color_discrete(name = "Presence") + - theme_minimal() - - try({ - ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", - width = 8, height = 12) - }) - + # Define cross-validation folds - pa_spec_core = dplyr::filter(pa_spec, record_type != "background") + pa_spec_core = dplyr::filter(pa_spec, record_type == "core") pa_spec_background = dplyr::filter(pa_spec, record_type == "background") folds = tryCatch({ @@ -171,21 +151,28 @@ for(species in setdiff(target_species, species_processed)){ }) pa_spec = pa_spec_core %>% - dplyr::mutate(fold_eval = folds) %>% + dplyr::mutate(fold_individual = folds) %>% bind_rows(pa_spec_background) %>% group_by(present) %>% mutate(weight = 1/n()) %>% ungroup() + # Plot result + ref = st_bbox(sampling_region)[c(1,3,2,4)] + ggplot() + + ggtitle(species) + + geom_sf(data = sa_polygon) + + geom_sf(data = range_spec, aes(fill = "IUCN range"), alpha = 0.1, color = NA, show.legend = "fill") + + geom_sf(data = sampling_region, aes(fill = "Sampling region"), alpha = 0.1, color = NA, show.legend = "fill") + + geom_sf(data = pa_spec, aes(color = as.factor(present), shape = as.factor(record_type)), size = 0.7) + + scale_fill_manual(name = "Polygons", values = c("IUCN range" = "red", "Sampling region" = "blue")) + + scale_color_discrete(name = "Presence") + + scale_shape_discrete(name = "Record type") + + theme_minimal() + + try({ + ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 3) + }) + save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) } - -# Combine presence-absence data -files = list.files("data/r_objects/pa_sampling/", full.names = T) -model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% - bind_rows() %>% - group_by(species) %>% - dplyr::filter(all(1:5 %in% fold_eval)) %>% - ungroup - -save(model_data, file = "data/r_objects/model_data.RData") diff --git a/R/03_03_model_data_finalization.R b/R/03_03_model_data_finalization.R new file mode 100644 index 0000000..8631c4d --- /dev/null +++ b/R/03_03_model_data_finalization.R @@ -0,0 +1,79 @@ +library(dplyr) +library(tidyr) +library(ggplot2) +library(gridExtra) +library(sf) +library(blockCV) + +source("R/utils.R") + +# Combine presence-absence data +files = list.files("data/r_objects/pa_sampling/", full.names = T) + +model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% + bind_rows() + +model_data_core = dplyr::filter(model_data, record_type == "core") +model_data_background = dplyr::filter(model_data, record_type == "background") + +# Define global folds +spatial_folds = suppressMessages( + blockCV::cv_spatial( + model_data_core, + column = "present", + k = 5 + ) +) + +model_data_core$fold_global = spatial_folds$folds_ids +model_data = bind_rows(model_data_core, model_data_background) %>% + arrange(species) %>% + relocate(contains("fold"), .after = last_col()) %>% + mutate(species = as.factor(species)) + +# Explore folds assignment +cv_plot(spatial_folds) +ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5) + +folds_summary = model_data %>% + dplyr::filter(record_type == "core") %>% + sf::st_drop_geometry() %>% + dplyr::group_by(species, fold_global) %>% + dplyr::summarise( + n_occ = length(which(present == 1)), + n_abs = length(which(present == 0)) + ) + +folds_uniform = folds_summary %>% + dplyr::group_by(species, fold_global) %>% + dplyr::summarise( + is_uniform = n_occ == 0 | n_abs == 0 + ) %>% + dplyr::group_by(species) %>% + dplyr::summarise( + folds_total = n(), + folds_uniform = length(which(is_uniform)), + folds_mixed = folds_total - folds_uniform + ) %>% + dplyr::ungroup() + +plot_1 = ggplot(folds_uniform, aes(x=folds_total)) + + geom_histogram(binwidth=0.5, fill = "orange") + + labs(title="Number of folds per species", x="Count", y = "Number of species") + + theme_minimal() +plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) + + geom_histogram(binwidth=0.5, fill = "violet") + + labs(title="Number of uniform folds per species", x="Count", y = "Number of species", + subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") + + theme_minimal() +plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) + + geom_histogram(binwidth=0.5, fill = "blue") + + labs(title="Number of mixed folds per species", x="Count", y = "Number of species", + subtitle="Mixed folds both absences and presences and can be used in model evaluation.") + + theme_minimal() + +plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1) +ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7) + +# Save Model data +save(model_data, file = "data/r_objects/model_data.RData") \ No newline at end of file diff --git a/R/04_01_modelling_ssdm.R b/R/04_01_modelling_ssdm.R index bc1346c..77c25ec 100644 --- a/R/04_01_modelling_ssdm.R +++ b/R/04_01_modelling_ssdm.R @@ -4,82 +4,67 @@ library(sf) library(caret) library(cito) library(pROC) +library(parallel) +library(doParallel) +library(foreach) source("R/utils.R") load("data/r_objects/model_data.RData") -# ----------------------------------------------------------------------# -# Run training #### -# ----------------------------------------------------------------------# species_processed = list.files("data/r_objects/ssdm_results/performance/", pattern = "RData") %>% stringr::str_remove(".RData") -data_split = model_data %>% - dplyr::filter(!is.na(fold_eval) & !species %in% species_processed) %>% +data_split = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(!species %in% species_processed) %>% dplyr::group_by(species) %>% dplyr::group_split() -for(pa_spec in data_split){ +# ----------------------------------------------------------------------# +# Set up parallel processing #### +# ----------------------------------------------------------------------# +num_cores <- 2 +cl <- makeCluster(num_cores) +registerDoParallel(cl) + +# ----------------------------------------------------------------------# +# Run training #### +# ----------------------------------------------------------------------# +# Use foreach for parallel processing across species +foreach(pa_spec = data_split, .packages = c("dplyr", "caret", "cito", "pROC", "tidyr")) %dopar% { + na_performance = list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, + precision = NA_real_, recall = NA_real_, f1 = NA_real_, + tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_) + + predictors = paste0("bio", 1:19) + species = pa_spec$species[1] print(species) - # Define empty result for performance eval - na_performance = list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, - Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, - TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) - - # Create factor presence column pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P")) + folds = sort(unique(na.omit(pa_spec$fold_global))) # Outer CV loop (for averaging performance metrics) - performance_cv = lapply(1:5, function(fold){ + performance_cv = lapply(folds, function(fold){ print(paste("Fold", fold)) ## Preparations ##### - data_test = dplyr::filter(pa_spec, fold_eval == fold) - data_train = dplyr::filter(pa_spec, is.na(fold_eval) | fold_eval != fold) # include background samples + data_train = dplyr::filter(pa_spec, record_type == "background" | fold_global != fold) # include background samples + data_test = dplyr::filter(pa_spec, fold_global == fold) - # Create inner CV folds for model training - folds_train = tryCatch({ - cv_train = blockCV::cv_spatial( - data_train, - column = "present", - k = 5, - progress = F, plot = F, report = F - ) - - cv_train$folds_ids - }, error = function(e){ - NA - }) - - if(is.numeric(folds_train)){ - data_train$fold_train = folds_train - } else { - return() + if(all(data_train$present == 0)){ # If only absences in training dataset --> return + return(NULL) } - # Drop geometry - data_train$geometry = NULL - data_test$geometry = NULL - - # Define caret training routine - index_train = lapply(unique(sort(data_train$fold_train)), function(x){ - return(which(data_train$fold_train != x)) - }) - train_ctrl = caret::trainControl( search = "random", classProbs = TRUE, - index = index_train, + number = 5, summaryFunction = caret::twoClassSummary, savePredictions = "final" ) - # Define predictors - predictors = paste0("bio", 1:19) - ## Random Forest ##### rf_performance = tryCatch({ # Fit model @@ -87,9 +72,9 @@ for(pa_spec in data_split){ x = data_train[, predictors], y = data_train$present_fct, method = "ranger", - metric = "Accuracy", trControl = train_ctrl, tuneLength = 8, + weights = data_train$weight, verbose = F ) @@ -106,6 +91,7 @@ for(pa_spec in data_split){ method = "gbm", trControl = train_ctrl, tuneLength = 8, + weights = data_train$weight, verbose = F ) evaluate_model(gbm_fit, data_test) @@ -120,6 +106,7 @@ for(pa_spec in data_split){ y = data_train$present_fct, method = "gamSpline", tuneLength = 8, + weights = data_train$weight, trControl = train_ctrl ) evaluate_model(gam_fit, data_test) @@ -137,7 +124,7 @@ for(pa_spec in data_split){ loss = "binomial", epochs = 200L, burnin = 200L, - early_stopping = 30, + early_stopping = 25, lr = 0.001, batchsize = min(ceiling(nrow(data_train)/10), 64), dropout = 0.25, @@ -157,7 +144,7 @@ for(pa_spec in data_split){ performance_summary = tibble( species = !!species, obs = nrow(data_train), - fold_eval = fold, + fold_global = fold, model = c("rf", "gbm", "gam", "nn"), auc = c(rf_performance$auc, gbm_performance$auc, gam_performance$auc, nn_performance$auc), accuracy = c(rf_performance$accuracy, gbm_performance$accuracy, gam_performance$accuracy, nn_performance$accuracy), @@ -170,7 +157,7 @@ for(pa_spec in data_split){ tn = c(rf_performance$tn, gbm_performance$tn, gam_performance$tn, nn_performance$tn), fn = c(rf_performance$fn, gbm_performance$fn, gam_performance$fn, nn_performance$fn) ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") return(performance_summary) }) @@ -179,12 +166,15 @@ for(pa_spec in data_split){ performance_spec = bind_rows(performance_cv) if(nrow(performance_spec) != 0){ performance_spec = performance_spec %>% - dplyr::arrange(fold_eval, model, metric) + dplyr::arrange(fold_global, model, metric) save(performance_spec, file = paste0("data/r_objects/ssdm_results/performance/", species, ".RData")) } } +# Stop the parallel cluster when done +stopCluster(cl) + # Combine results files = list.files("data/r_objects/ssdm_results/performance/", full.names = T, pattern = ".RData") ssdm_performance = lapply(files, function(f){load(f); return(performance_spec)}) %>% @@ -196,7 +186,6 @@ save(ssdm_performance, file = "data/r_objects/ssdm_performance.RData") # Train full models #### # ----------------------------------------------------------------------# data_split = model_data %>% - dplyr::filter(!is.na(fold_eval)) %>% dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>% dplyr::group_by(species) %>% dplyr::group_split() @@ -230,7 +219,7 @@ for(pa_spec in data_split){ summaryFunction = caret::twoClassSummary, savePredictions = "final" ) - + # Define predictors predictors = paste0("bio", 1:19) @@ -240,12 +229,11 @@ for(pa_spec in data_split){ x = pa_spec[, predictors], y = pa_spec$present_fct, method = "ranger", - metric = "Accuracy", trControl = train_ctrl, tuneLength = 8, + weights = data_train$weight, verbose = F ) - save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData")) }) @@ -258,6 +246,7 @@ for(pa_spec in data_split){ method = "gbm", trControl = train_ctrl, tuneLength = 8, + weights = data_train$weight, verbose = F ) save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData")) @@ -269,8 +258,9 @@ for(pa_spec in data_split){ x = pa_spec[, predictors], y = pa_spec$present_fct, method = "gamSpline", - trControl = train_ctrl, tuneLength = 8, + weights = data_train$weight, + trControl = train_ctrl ) save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData")) }) @@ -285,9 +275,9 @@ for(pa_spec in data_split){ loss = "binomial", epochs = 200L, burnin = 200L, - early_stopping = 30, + early_stopping = 25, lr = 0.001, - batchsize = min(ceiling(nrow(pa_spec)/10), 64), + batchsize = min(ceiling(nrow(data_train)/10), 64), dropout = 0.25, optimizer = config_optimizer("adam"), validation = 0.2, diff --git a/R/04_02_modelling_msdm_embed.R b/R/04_02_modelling_msdm_embed.R index 48c4e9c..9814cb5 100644 --- a/R/04_02_modelling_msdm_embed.R +++ b/R/04_02_modelling_msdm_embed.R @@ -7,7 +7,6 @@ source("R/utils.R") load("data/r_objects/model_data.RData") model_data = model_data %>% - dplyr::mutate(species = as.factor(species)) %>% sf::st_drop_geometry() # ----------------------------------------------------------------------# @@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + # 1. Cross validation for(fold in 1:5){ # Prepare data - train_data = dplyr::filter(model_data, fold_eval != fold) + data_train = model_data %>% + dplyr::filter(record_type == "background" | fold_global != fold) # Run model - converged = F - while(!converged){ - msdm_embed_fit = dnn( - formula, - data = train_data, - hidden = c(200L, 200L, 200L), - loss = "binomial", - epochs = 5000, - lr = 0.001, - batchsize = 4096, - dropout = 0.25, - burnin = 50, - optimizer = config_optimizer("adam"), - early_stopping = 200, - validation = 0.2, - device = "cuda" - ) - - if(min(msdm_embed_fit$losses$valid_l, na.rm = T) < 0.4){ - converged = T - } - } + msdm_embed_fit = dnn( + formula, + data = data_train, + hidden = c(200L, 200L, 200L), + loss = "binomial", + epochs = 5000, + lr = 0.001, + batchsize = 1024, + dropout = 0.25, + burnin = 50, + optimizer = config_optimizer("adam"), + early_stopping = 100, + validation = 0.2, + device = "cuda" + ) save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold,".RData")) } @@ -56,12 +49,11 @@ msdm_embed_fit = dnn( loss = "binomial", epochs = 5000, lr = 0.001, - baseloss = 1, - batchsize = 4096, + batchsize = 1024, dropout = 0.25, - burnin = 500, + burnin = 50, optimizer = config_optimizer("adam"), - early_stopping = 300, + early_stopping = 100, validation = 0.2, device = "cuda" ) @@ -74,15 +66,15 @@ save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed msdm_embed_performance = lapply(1:5, function(fold){ load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData")) - test_data_split = model_data %>% - dplyr::filter(fold_eval == fold) %>% + data_test_split = model_data %>% + dplyr::filter(fold_global == fold) %>% dplyr::group_split(species) - lapply(test_data_split, function(test_data_spec){ - species = test_data_spec$species[1] + lapply(data_test_split, function(data_test_spec){ + species = data_test_spec$species[1] performance = tryCatch({ - evaluate_model(msdm_embed_fit, test_data_spec) + evaluate_model(msdm_embed_fit, data_test_spec) }, error = function(e){ list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, precision = NA_real_, recall = NA_real_, f1 = NA_real_, @@ -93,11 +85,11 @@ msdm_embed_performance = lapply(1:5, function(fold){ as_tibble() %>% mutate( species = !!species, - obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), - fold_eval = !!fold, + obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)), + fold_global = !!fold, model = "msdm_embed", ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") }) %>% bind_rows() }) %>% diff --git a/R/04_03_modelling_msdm_onehot.R b/R/04_03_modelling_msdm_onehot.R index f5e7ee3..114b606 100644 --- a/R/04_03_modelling_msdm_onehot.R +++ b/R/04_03_modelling_msdm_onehot.R @@ -7,7 +7,6 @@ source("R/utils.R") load("data/r_objects/model_data.RData") model_data = model_data %>% - dplyr::mutate(species = as.factor(species)) %>% sf::st_drop_geometry() # ----------------------------------------------------------------------# @@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + # 1. Cross validation for(fold in 1:5){ # Prepare data - train_data = dplyr::filter(model_data, fold_eval != fold) + data_train = model_data %>% + dplyr::filter(record_type == "background" | fold_global != fold) # Run model - converged = F - while(!converged){ - msdm_onehot_fit = dnn( - formula, - data = train_data, - hidden = c(200L, 200L, 200L), - loss = "binomial", - epochs = 5000, - lr = 0.001, - batchsize = 4096, - dropout = 0.25, - burnin = 50, - optimizer = config_optimizer("adam"), - early_stopping = 200, - validation = 0.2, - device = "cuda" - ) - - if(min(msdm_onehot_fit$losses$valid_l, na.rm = T) < 0.4){ - converged = T - } - } + msdm_onehot_fit = dnn( + formula, + data = data_train, + hidden = c(200L, 200L, 200L), + loss = "binomial", + epochs = 5000, + lr = 0.001, + batchsize = 1024, + dropout = 0.25, + burnin = 50, + optimizer = config_optimizer("adam"), + early_stopping = 100, + validation = 0.2, + device = "cuda" + ) save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold,".RData")) } @@ -56,12 +49,11 @@ msdm_onehot_fit = dnn( loss = "binomial", epochs = 5000, lr = 0.001, - baseloss = 1, - batchsize = 4096, + batchsize = 1024, dropout = 0.25, - burnin = 500, + burnin = 50, optimizer = config_optimizer("adam"), - early_stopping = 300, + early_stopping = 100, validation = 0.2, device = "cuda" ) @@ -74,15 +66,15 @@ save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_one msdm_onehot_performance = lapply(1:5, function(fold){ load(paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold, ".RData")) - test_data_split = model_data %>% - dplyr::filter(fold_eval == fold) %>% + data_test_split = model_data %>% + dplyr::filter(fold_global == fold) %>% dplyr::group_split(species) - lapply(test_data_split, function(test_data_spec){ - species = test_data_spec$species[1] + lapply(data_test_split, function(data_test_spec){ + species = data_test_spec$species[1] performance = tryCatch({ - evaluate_model(msdm_onehot_fit, test_data_spec) + evaluate_model(msdm_onehot_fit, data_test_spec) }, error = function(e){ list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, precision = NA_real_, recall = NA_real_, f1 = NA_real_, @@ -93,11 +85,11 @@ msdm_onehot_performance = lapply(1:5, function(fold){ as_tibble() %>% mutate( species = !!species, - obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), - fold_eval = !!fold, + obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)), + fold_global = !!fold, model = "msdm_onehot", ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") }) %>% bind_rows() }) %>% diff --git a/R/04_05_modelling_msdm_rf.R b/R/04_04_modelling_msdm_rf.R similarity index 56% rename from R/04_05_modelling_msdm_rf.R rename to R/04_04_modelling_msdm_rf.R index 0e349b8..4544b73 100644 --- a/R/04_05_modelling_msdm_rf.R +++ b/R/04_04_modelling_msdm_rf.R @@ -5,42 +5,35 @@ library(ranger) source("R/utils.R") -load("data/r_objects/model_data_random_abs_extended.RData") +load("data/r_objects/model_data.RData") model_data = model_data %>% - dplyr::filter(!is.na(fold_eval)) %>% dplyr::mutate( - species = as.factor(species), present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P")) - ) + ) %>% + sf::st_drop_geometry() # ----------------------------------------------------------------------# # Train model #### # ----------------------------------------------------------------------# # Define predictors -predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species") +predictors = c(paste0("bio", 1:19), "species") # Cross validation for(fold in 1:5){ + print(paste("Fold", fold)) ## Preparations ##### - data_train = dplyr::filter(model_data, fold_eval != fold) %>% - sf::st_drop_geometry() + data_train = model_data %>% + dplyr::filter(record_type == "background" | fold_global != fold) - # Define caret training routine train_ctrl = caret::trainControl( - method = "cv", - number = 5, + search = "random", classProbs = TRUE, + number = 5, summaryFunction = caret::twoClassSummary, savePredictions = "final" ) - tune_grid = expand.grid( - mtry = c(2,4,6,8), - splitrule = "gini", - min.node.size = c(1,4,9,16) - ) - # Run model rf_fit = caret::train( x = data_train[, predictors], @@ -48,53 +41,46 @@ for(fold in 1:5){ method = "ranger", metric = "Accuracy", trControl = train_ctrl, - tuneGrid = tune_grid, + tuneLength = 8, weights = data_train$weight, num.threads = 48 ) - save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData")) + save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData")) } # Full model # Define caret training routine -full_data = model_data %>% - sf::st_drop_geometry() - train_ctrl = caret::trainControl( - method = "cv", - number = 5, + search = "random", classProbs = TRUE, + number = 5, summaryFunction = caret::twoClassSummary, savePredictions = "final" ) -tune_grid = expand.grid( - mtry = c(2,4,6,8), - splitrule = "gini", - min.node.size = c(1,4,9,16) -) - # Run model rf_fit = caret::train( - x = full_data[, predictors], - y = full_data$present_fct, + x = model_data[, predictors], + y = model_data$present_fct, method = "ranger", metric = "Accuracy", trControl = train_ctrl, - tuneGrid = tune_grid + tuneLength = 8, + weights = full_data$weight, + num.threads = 48 ) -save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") +save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData") # ----------------------------------------------------------------------# # Evaluate model #### # ----------------------------------------------------------------------# -msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){ - load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData")) +msdm_rf_performance = lapply(1:5, function(fold){ + load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData")) test_data = model_data %>% - dplyr::filter(fold_eval == fold, record_type != "absence_background") %>% + dplyr::filter(fold_global == fold) %>% sf::st_drop_geometry() actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P")) @@ -119,36 +105,36 @@ msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){ cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P") list( - AUC = as.numeric(auc), - Accuracy = cm$overall["Accuracy"], - Kappa = cm$overall["Kappa"], - Precision = cm$byClass["Precision"], - Recall = cm$byClass["Recall"], - F1 = cm$byClass["F1"], - TP = cm$table["P", "P"], - FP = cm$table["P", "A"], - TN = cm$table["A", "A"], - FN = cm$table["A", "P"] + auc = as.numeric(auc), + accuracy = cm$overall["Accuracy"], + kappa = cm$overall["Kappa"], + precision = cm$byClass["Precision"], + recall = cm$byClass["Recall"], + f1 = cm$byClass["F1"], + tp = cm$table["P", "P"], + fp = cm$table["P", "A"], + tn = cm$table["A", "A"], + fn = cm$table["A", "P"] ) }, error = function(e){ - list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, - Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, - TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) + list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, + precision = NA_real_, recall = NA_real_, f1 = NA_real_, + tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_) }) performance_summary = performance %>% as_tibble() %>% mutate( species = !!species, - obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), - fold_eval = !!fold, - model = "MSDM_rf_random_abs_extended", + obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)), + fold_global = !!fold, + model = "msdm_rf", ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") %>% + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") %>% drop_na() }) %>% bind_rows() }) %>% bind_rows() -save(msdm_rf_random_abs_extended_performance, file = paste0("data/r_objects/msdm_rf_random_abs_extended_performance.RData")) +save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData")) diff --git a/R/04_06_rf_testing.R b/R/04_06_rf_testing.R deleted file mode 100644 index 9057285..0000000 --- a/R/04_06_rf_testing.R +++ /dev/null @@ -1,144 +0,0 @@ -library(dplyr) -library(tidyr) -library(caret) -library(ranger) - -source("R/utils.R") - -load("data/r_objects/model_data_random_abs.RData") - -model_data = model_data %>% - dplyr::filter(!is.na(fold_eval)) %>% - dplyr::mutate( - species = as.factor(species), - present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P")) - ) - -predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species") - -full_data = model_data %>% - sf::st_drop_geometry() - -train_ctrl = caret::trainControl( - search = "random", - classProbs = TRUE, - summaryFunction = caret::twoClassSummary, - savePredictions = "final" -) - -# Run model -rf_fit = caret::train( - x = full_data[, predictors], - y = full_data$present_fct, - method = "ranger", - metric = "Accuracy", - trControl = train_ctrl, - tuneLength = 1, - num.threads = 48, - importance = 'impurity', - verbose = T -) - -save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") - -varimp = varImp(rf_fit) - -# Cross validation -for(fold in 1:5){ - ## Preparations ##### - data_train = dplyr::filter(model_data, fold_eval != fold) %>% - sf::st_drop_geometry() - - # Define caret training routine - train_ctrl = caret::trainControl( - method = "cv", - number = 5, - classProbs = TRUE, - summaryFunction = caret::twoClassSummary, - savePredictions = "final" - ) - - tune_grid = expand.grid( - mtry = c(2,4,6,8), - splitrule = "gini", - min.node.size = c(1,4,9,16) - ) - - # Run model - rf_fit = caret::train( - x = data_train[, predictors], - y = data_train$present_fct, - method = "ranger", - metric = "Accuracy", - trControl = train_ctrl, - tuneGrid = tune_grid, - num.threads = 48, - verbose = F - ) - - save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData")) -} - -# ----------------------------------------------------------------------# -# Evaluate model #### -# ----------------------------------------------------------------------# -msdm_rf_random_abs_performance = lapply(1:5, function(fold){ - load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData")) - - test_data = dplyr::filter(model_data, fold_eval == fold) %>% - sf::st_drop_geometry() - - actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P")) - probs = predict(rf_fit, test_data, type = "prob")$P - preds = predict(rf_fit, test_data, type = "raw") - - eval_dfs = data.frame( - species = test_data$species, - actual, - probs, - preds - ) %>% - group_by(species) %>% - group_split() - - - lapply(eval_dfs, function(eval_df_spec){ - species = eval_df_spec$species[1] - - performance = tryCatch({ - auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs, levels = c("P", "A"), direction = ">")$auc - cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P") - - list( - AUC = as.numeric(auc), - Accuracy = cm$overall["Accuracy"], - Kappa = cm$overall["Kappa"], - Precision = cm$byClass["Precision"], - Recall = cm$byClass["Recall"], - F1 = cm$byClass["F1"], - TP = cm$table["P", "P"], - FP = cm$table["P", "A"], - TN = cm$table["A", "A"], - FN = cm$table["A", "P"] - ) - }, error = function(e){ - list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, - Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, - TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) - }) - - performance_summary = performance %>% - as_tibble() %>% - mutate( - species = !!species, - obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), - fold_eval = !!fold, - model = "MSDM_rf_random_abs", - ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") - }) %>% - bind_rows() -}) %>% - bind_rows() - -save(msdm_rf_random_abs_performance, file = paste0("data/r_objects/msdm_rf_random_abs_performance.RData")) diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd index bc5a0b7..8fb626b 100644 --- a/R/05_01_performance_report.qmd +++ b/R/05_01_performance_report.qmd @@ -12,9 +12,9 @@ library(plotly) library(DT) load("../data/r_objects/model_data.RData") -load("../data/r_objects/ssdm_performance.RData") +#load("../data/r_objects/ssdm_performance.RData") load("../data/r_objects/msdm_embed_performance.RData") -load("../data/r_objects/msdm_onehot_performance.RData") +#load("../data/r_objects/msdm_onehot_performance.RData") load("../data/r_objects/msdm_rf_performance.RData") load("../data/r_objects/range_maps.RData") @@ -56,16 +56,12 @@ loess_fit = function(x, y, span = 0.75){ } # Performance table -performance = ssdm_results %>% - bind_rows(msdm_embed_performance) %>% - bind_rows(msdm_onehot_performance) %>% +performance = msdm_embed_performance %>% + #bind_rows(msdm_embed_performance) %>% + #bind_rows(msdm_onehot_performance) %>% bind_rows(msdm_rf_performance) %>% - dplyr::mutate( # TODO: remove when only working with new data - metric = stringr::str_to_lower(metric), - model = stringr::str_to_lower(model) - ) %>% dplyr::group_by(species, model, metric) %>% - dplyr::summarise(value = mean(value, na.rm = F)) %>% + dplyr::summarise(value = mean(value, na.rm = T)) %>% dplyr::mutate( value = case_when( ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, diff --git a/R/05_01_performance_report_rf.qmd b/R/05_01_performance_report_rf.qmd new file mode 100644 index 0000000..534a5f8 --- /dev/null +++ b/R/05_01_performance_report_rf.qmd @@ -0,0 +1,692 @@ +--- +title: "SDM Performance report" +format: html +editor: visual +engine: knitr +--- + +```{r init, echo = FALSE, include = FALSE} +library(tidyverse) +library(sf) +library(plotly) +library(DT) + +load("../data/r_objects/model_data.RData") +load("../data/r_objects/range_maps.RData") +load("../data/r_objects/range_maps_gridded.RData") +load("../data/r_objects/occs_final.RData") +load("../data/r_objects/functional_groups.RData") + +sf::sf_use_s2(use_s2 = FALSE) + +# Load performance of different RF versions +load("../data/r_objects/deprecated/msdm_rf_no_species_random_abs_performance.RData") +load("../data/r_objects/deprecated/msdm_rf_no_species_performance.RData") +load("../data/r_objects/msdm_rf_performance.RData") + +msdm_rf_no_species_performance$model = "rf_noSpec_oldVars_informedAbs" +msdm_rf_no_species_performance$metric = tolower(msdm_rf_no_species_performance$metric) +msdm_rf_no_species_random_abs_performance$model = "rf_noSpec_oldVars_randomAbs" +msdm_rf_no_species_random_abs_performance$metric = tolower(msdm_rf_no_species_random_abs_performance$metric) +msdm_rf_performance$model = "rf_noSpec_newVars_randomAbs" +``` + +```{r globals, echo = FALSE, cache = TRUE, include = FALSE} +# Regression functions +asym_fit = function(x, y){ + nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) + new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) + data.frame( + x = new_x, + fit = predict(nls_fit, newdata = data.frame(x = new_x)) + ) +} + +glm_fit = function(x, y, family = "binomial"){ + glm_fit = suppressWarnings(glm(y~x, family = family)) + new_x = seq(min(x), max(x), length.out = 100) + data.frame( + x = new_x, + fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") + ) +} + +loess_fit = function(x, y, span = 0.75){ + df = data.frame(x = x, y = y) + loess_fit = loess(y ~ x, data = df, span = span) + new_x = seq(min(x), max(x), length.out = 100) + data.frame( + x = new_x, + fit = predict(loess_fit, newdata = data.frame(x = new_x)) + ) +} + +# Performance table +performance = msdm_rf_performance %>% + bind_rows(msdm_rf_no_species_performance) %>% + bind_rows(msdm_rf_no_species_random_abs_performance) %>% + dplyr::group_by(species, model, metric) %>% + dplyr::summarise(value = mean(value, na.rm = F)) %>% + dplyr::mutate( + value = case_when( + ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, + ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, + .default = value + ) + ) %>% + ungroup() + +plot_performance = function(df_plot, y_var, y_label, x_var, x_label, x_log = T, reg_func = loess_fit) { + df_plot = df_plot %>% + dplyr::rename( + x_var = !!x_var, + y_var = !!y_var + ) + + # Calculate regression lines for each model and metric combination + suppressWarnings({ + regression_lines = df_plot %>% + group_by(model) %>% + group_modify( ~ reg_func(.x[["x_var"]], .x[["y_var"]])) + }) + + # Create base plot + plot <- plot_ly() %>% + layout( + title = paste0("Model Performance vs. ", x_label), + xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")), + yaxis = list(title = y_label), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest' + ) + + # Points + for (model_name in unique(df_plot$model)) { + plot = plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~ x_var, + y = ~ y_var, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~ model, + hoverinfo = 'text', + text = ~ paste( + "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(y_var, 3) + ) + ) + } + + # Add regression lines + for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~ x, + y = ~ fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE + ) + } + + return(plot) +} + +add_partial_residuals = function(df, focus_var, control_vars, family = quasibinomial) { + model_formula <- as.formula(paste("value ~", paste(c(focus_var, control_vars), collapse = " + "))) + model = glm(model_formula, data = df, family = family) + + if (focus_var %in% names(model$coefficients)) { + df[[paste0(focus_var, "_partial")]] = residuals(model, type = "response") + model$coefficients[focus_var] * df[[focus_var]] + } + + return(df) +} + +``` + +## Summary + +This document contrasts the performance of different variations of random forest models. + +## Analysis + +### Quantify drivers of model performance + +```{r prepare_model_df, echo = FALSE, include = FALSE} +# Number of records +df_join = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(present == 1) %>% + dplyr::group_by(species) %>% + dplyr::summarise(n_obs = n()) + +performance = performance %>% + dplyr::left_join(df_join, by = "species") + +# Range size +df_join = range_maps %>% + sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs") %>% + dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>% # range in sqkm + sf::st_drop_geometry() + +performance = performance %>% + inner_join(df_join, by = c("species" = "name_matched")) + +# Range coverage +range_maps_gridded = range_maps_gridded %>% + sf::st_transform("+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs") + +df_cells_total = range_maps_gridded %>% + dplyr::rename("species" = name_matched) %>% + group_by(species) %>% + summarise(cells_total = n()) %>% + st_drop_geometry() + +df_cells_occ = range_maps_gridded %>% + st_join(occs_final, join = st_intersects) %>% + filter(name_matched == species) %>% # Filter only intersections of the same species + group_by(species) %>% + summarise(cells_occupied = n_distinct(geometry)) %>% + st_drop_geometry() + +df_join = df_cells_total %>% + dplyr::inner_join(df_cells_occ, by = "species") %>% + dplyr::mutate(range_cov = cells_occupied / cells_total) %>% + dplyr::select(species, range_cov) + +performance = performance %>% + inner_join(df_join, by = "species") + +# Range coverage bias +df_occs_total = occs_final %>% + st_drop_geometry() %>% + group_by(species) %>% + summarise(occs_total = n()) + +df_join = df_occs_total %>% + dplyr::inner_join(df_cells_total, by = "species") %>% + dplyr::inner_join(df_cells_occ, by = "species") %>% + dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) %>% + dplyr::select(species, range_bias) + +performance = performance %>% + inner_join(df_join, by = "species") +``` + +### Number of records + +::: panel-tabset +#### Marginal Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") +plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") +plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") +plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") +plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") +plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") +plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` +::: + +#### Partial Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "n_obs", x_label = "Number of records") +bslib::card(plot, full_screen = T) +``` +::: +::: + +### Range size + +Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). + +::: panel-tabset +#### Marginal Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") +plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") +plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") +plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") +plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") +plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") +plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` +::: + +#### Partial Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_size", x_label = "Range size [km]") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_size", x_label = "Range size [km²]") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_size", x_label = "Range size [km²]") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_size", x_label = "Range size [km²]") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_size", x_label = "Range size [km²]") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_size", x_label = "Range size [km²]") +bslib::card(plot, full_screen = T) +``` +::: +::: + +### Range coverage + +Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. + +$$ +RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} +$$ + +::: panel-tabset +#### Marginal Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") +plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") +plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") +plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") +plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") +plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") +plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` +::: + +#### Partial Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_cov", x_label = "Range coverage") +bslib::card(plot, full_screen = T) +``` +::: +::: + +### Range coverage bias + +Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. + +$$ +RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)} +$$ + +Higher bias values indicate that occurrence records are spatially more clustered within the range of the species. + +::: panel-tabset +#### Marginal Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") +plot = plot_performance(df_plot, y_var = "value", y_label = "AUC", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") +plot = plot_performance(df_plot, y_var = "value", y_label = "F1", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") +plot = plot_performance(df_plot, y_var = "value", y_label = "Kappa", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") +plot = plot_performance(df_plot, y_var = "value", y_label = "Accuracy", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") +plot = plot_performance(df_plot, y_var = "value", y_label = "Precision", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") +plot = plot_performance(df_plot, y_var = "value", y_label = "Recall", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` +::: + +#### Partial Effects + +::: panel-tabset +##### AUC + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "auc") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "AUC (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### F1 + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "f1") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "F1 (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Cohen's kappa + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "kappa") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias"), family = gaussian) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Kappa (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Accuracy + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "accuracy") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Accuracy (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Precision + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "precision") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Precision (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` + +##### Recall + +```{r echo = FALSE} +df_plot = dplyr::filter(performance, metric == "recall") %>% + add_partial_residuals(focus_var = "n_obs", control_vars = c("range_size", "range_cov", "range_bias")) +plot = plot_performance(df_plot, y_var = "n_obs_partial", y_label = "Recall (partial residual)", x_var = "range_bias", x_label = "Range coverage bias") +bslib::card(plot, full_screen = T) +``` +::: +::: diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R index 9e2d074..4d347c4 100644 --- a/R/05_02_publication_analysis.R +++ b/R/05_02_publication_analysis.R @@ -13,10 +13,11 @@ library(cito) source("R/utils.R") -load("data/r_objects/model_data_random_abs_extended.RData") -load("data/r_objects/ssdm_results.RData") +load("data/r_objects/model_data.RData") +load("data/r_objects/ssdm_performance.RData") load("data/r_objects/msdm_embed_performance.RData") load("data/r_objects/msdm_onehot_performance.RData") +load("data/r_objects/msdm_rf_performance.RData") load("data/r_objects/functional_groups.RData") load("data/r_objects/sa_polygon.RData") @@ -25,26 +26,27 @@ load("data/r_objects/range_maps.RData") sf::sf_use_s2(use_s2 = FALSE) model_data = model_data %>% - dplyr::filter(!is.na(fold_eval)) %>% dplyr::mutate(species = as.factor(species)) # ------------------------------------------------------------------ # # 1. Collect performance results #### # ------------------------------------------------------------------ # -performance = ssdm_results %>% +performance = ssdm_performance %>% bind_rows(msdm_embed_performance) %>% bind_rows(msdm_onehot_performance) %>% - dplyr::group_by(species, model, metric) %>% - dplyr::summarise(value = mean(value, na.rm = F)) %>% + bind_rows(msdm_rf_performance) %>% dplyr::mutate( metric = factor(tolower(metric), levels = c("auc", "f1", "kappa", "accuracy", "precision", "recall")), - model = factor(model, levels = c("GAM", "GBM", "RF", "NN", "MSDM_onehot", "MSDM_embed")), + model = factor(model, levels = c("gam", "gbm", "rf", "nn", "msdm_onehot", "msdm_embed", "msdm_rf")), value = case_when( ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, .default = value ) - ) + ) %>% + dplyr::group_by(species, model, metric) %>% + dplyr::summarise(value = mean(value, na.rm = F)) %>% + ungroup() # ------------------------------------------------------------------ # # 2. Model comparison #### @@ -81,13 +83,13 @@ obs_count = model_data %>% sf::st_drop_geometry() %>% dplyr::filter(present == 1, !is.na(fold_eval)) %>% dplyr::group_by(species) %>% - dplyr::summarise(obs = n()) + dplyr::summarise(n_obs = n()) df_plot = performance %>% dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) %>% dplyr::left_join(obs_count, by = "species") -ggplot(df_plot, aes(x = obs, y = value, color = model, fill = model)) + +ggplot(df_plot, aes(x = n_obs, y = value, color = model, fill = model)) + geom_point(alpha = 0.1) + geom_smooth() + facet_wrap(~ metric, scales = "free_y") + @@ -157,18 +159,24 @@ library(ranger) plot_predictions = function(spec, model_data, raster_data, algorithms){ # Species data load("data/r_objects/range_maps.RData") + load("data/r_objects/sa_polygon.RData") pa_spec = model_data %>% - dplyr::filter(species == !!spec) + dplyr::filter(species == !!spec, record_type == "core") range_spec = range_maps %>% dplyr::filter(name_matched == !!spec) %>% sf::st_transform(sf::st_crs(pa_spec)) - + sa_polygon = sa_polygon %>% + sf::st_transform(crs(raster_data, proj = T)) + # Extract raster values into df bbox_spec = sf::st_bbox(range_spec) %>% - expand_bbox(expansion = 0.25) + expand_bbox(expansion = 0.5) + + raster_crop = raster_data %>% + terra::crop(bbox_spec) %>% + terra::mask(vect(sa_polygon)) - raster_crop = terra::crop(raster_data, bbox_spec) pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),] new_data = raster_crop %>% @@ -191,15 +199,6 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){ } else if(algorithm == "msdm_rf"){ load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData") predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) - } else if(algorithm == "msdm_rf_random_abs"){ - load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") - predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) - } else if(algorithm == "msdm_rf_fit_no_species"){ - load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_full.RData") - predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) - } else if(algorithm == "msdm_rf_fit_no_species_random_abs"){ - load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_random_abs_full.RData") - predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) } else if(algorithm == "nn") { load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData")) new_data_tmp = dplyr::select(new_data, -species) @@ -218,7 +217,7 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){ p = ggplot() + tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) + scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) + - geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FFFFFF99") + + geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FF000088") + geom_sf(data = range_spec, col = "red", fill = NA) + scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") + theme_minimal() + @@ -236,16 +235,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){ # Load raster raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% + stringr::str_subset("CHELSA_bio") %>% stringr::str_sort(numeric = T) raster_data = terra::rast(raster_filepaths) %>% terra::crop(sf::st_bbox(sa_polygon)) %>% terra::project(sf::st_crs(model_data)$input) -specs = sort(sample(levels(model_data$species), 4)) +specs = c("Histiotus velatus", "Hydrochoerus hydrochaeris", "Euryoryzomys legatus", "Eumops trumbulli") for(spec in specs){ - pdf(file = paste0("plots/range_predictions/", spec, " (msdm_rf).pdf"), width = 12) - plots = plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_rf", "msdm_rf_random_abs", "msdm_rf_fit_no_species", "msdm_rf_fit_no_species_random_abs")) + pdf(file = paste0("plots/range_predictions/", spec, ".pdf"), width = 12) + plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm_onehot", "msdm_embed", "msdm_rf")) lapply(plots, print) dev.off() } @@ -256,7 +256,6 @@ for(spec in specs){ load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData") load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData") load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData") -load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") compare_species_predictions = function(model, sample_size){ specs = sample(unique(model_data$species), 2, replace = F) @@ -329,7 +328,7 @@ obs_count = model_data %>% sf::st_drop_geometry() %>% dplyr::filter(present == 1, !is.na(fold_eval)) %>% dplyr::group_by(species) %>% - dplyr::summarise(obs = n()) + dplyr::summarise(n_obs = n()) all_embeddings = lapply(1:5, function(fold){ load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData")) @@ -377,7 +376,7 @@ r_embed_df = data.frame( df_plot = obs_count %>% dplyr::left_join(r_embed_df, by = "species") -ggplot(data = df_plot, aes(x = obs, y = value)) + +ggplot(data = df_plot, aes(x = n_obs, y = value)) + geom_point() + geom_smooth() + labs(title = "Mean and variance of correlation coefficient of species embeddings across folds") + @@ -557,3 +556,5 @@ range_dist_subset = range_dist[species_intersect, species_intersect] e_indices = match(species_intersect, species_lookup$species) embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices] FactoMineR::coeffRV(embeddings_dist_subset, range_dist_subset) + +# --> Significant correlation between embeddings and pairwise species distances (range > func > phylo) diff --git a/R/_publish.yml b/R/_publish.yml index c18ea3e..5604c70 100644 --- a/R/_publish.yml +++ b/R/_publish.yml @@ -2,3 +2,9 @@ quarto-pub: - id: 77b30762-b473-447d-be18-d4bbd6261fbf url: 'https://chrkoenig.quarto.pub/sdm-performance-report' + - id: 7777d450-82af-4364-b4c1-b52d7139ade0 + url: 'https://chrkoenig.quarto.pub/rf-nospec-performance-report' +- source: 05_01_performance_report_rf.qmd + quarto-pub: + - id: 98e776a5-fb0b-41f9-8733-446bf7ffb272 + url: 'https://chrkoenig.quarto.pub/sdm-performance-report-rf' -- GitLab