diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 03865cb12614e49b881a864410b068e7263f6db9..2726f8cc58de431325a4ede8ffc505981fe65b01 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -31,7 +31,6 @@ sa_polygon = rnaturalearth::ne_countries() %>% sf::st_union() raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% - #raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice stringr::str_sort(numeric = T) # Project (proj string generated with SA bbox coordinates on https://projectionwizard.org) diff --git a/R/04_01_modelling_ssdm.R b/R/04_01_modelling_ssdm.R new file mode 100644 index 0000000000000000000000000000000000000000..c336fe380caec77ba183cdd5284441ceab9193aa --- /dev/null +++ b/R/04_01_modelling_ssdm.R @@ -0,0 +1,298 @@ +library(furrr) +library(dplyr) +library(tidyr) +library(sf) +library(caret) +library(cito) +library(pROC) + +source("R/utils.R") + +load("data/r_objects/model_data.RData") + +# ----------------------------------------------------------------------# +# Run training #### +# ----------------------------------------------------------------------# +species_processed = list.files("data/r_objects/ssdm_results/", pattern = "RData") %>% + stringr::str_remove(".RData") + +data_split = model_data %>% + dplyr::filter(!species %in% species_processed) %>% + dplyr::group_by(species) %>% + dplyr::group_split() + + +for(pa_spec in data_split){ + species = pa_spec$species[1] + print(species) + + if(all(is.na(pa_spec$fold_eval))){ + print("Too few samples") + next + } + + # Define empty result for performance eval + na_performance = list( + AUC = NA, + Accuracy = NA, + Kappa = NA, + Precision = NA, + Recall = NA, + F1 = NA + ) + + # Create factor presence column + pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P")) + + # Outer CV loop (for averaging performance metrics) + performance_cv = lapply(sort(unique(pa_spec$fold_eval)), function(k){ + print(paste("Fold", k)) + + ## Preparations ##### + data_test = dplyr::filter(pa_spec, fold_eval == k) + data_train = dplyr::filter(pa_spec, fold_eval != k) + + if(nrow(data_test) == 0 || nrow(data_train) == 0){ + warning("Too few samples") + } + + # Create inner CV folds for model training + cv_train = blockCV::cv_spatial( + data_train, + column = "present", + k = 5, + progress = F, plot = F, report = F + ) + data_train$fold_train = cv_train$folds_ids + + # Drop geometry + data_train$geometry = NULL + data_test$geometry = NULL + + # Define caret training routine + index_train = lapply(unique(sort(data_train$fold_train)), function(x){ + return(which(data_train$fold_train != x)) + }) + + train_ctrl = caret::trainControl( + search = "random", + classProbs = TRUE, + index = index_train, + summaryFunction = caret::twoClassSummary, + savePredictions = "final", + ) + + # Define predictors + predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") + + ## Random Forest ##### + rf_performance = tryCatch({ + # Fit model + rf_fit = caret::train( + x = data_train[, predictors], + y = data_train$present_fct, + method = "rf", + trControl = train_ctrl, + tuneLength = 4, + verbose = F + ) + + evaluate_model(rf_fit, data_test) + }, error = function(e){ + na_performance + }) + + ## Gradient Boosted Machine #### + gbm_performance = tryCatch({ + gbm_fit = train( + x = data_train[, predictors], + y = data_train$present_fct, + method = "gbm", + trControl = train_ctrl, + tuneLength = 4, + verbose = F + ) + evaluate_model(gbm_fit, data_test) + }, error = function(e){ + na_performance + }) + + ## Generalized Additive Model #### + gam_performance = tryCatch({ + gam_fit = train( + x = data_train[, predictors], + y = data_train$present_fct, + method = "gamSpline", + tuneLength = 4, + trControl = train_ctrl + ) + evaluate_model(gam_fit, data_test) + }, error = function(e){ + na_performance + }) + + ## Neural Network #### + formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'))) + nn_performance = tryCatch({ + nn_fit = dnn( + formula, + data = data_train, + hidden = c(200L, 200L, 200L), + loss = "binomial", + activation = c("sigmoid", "leaky_relu", "leaky_relu"), + epochs = 200L, + burnin = 100L, + lr = 0.001, + batchsize = max(nrow(data_test)/10, 64), + lambda = 0.0001, + dropout = 0.2, + optimizer = config_optimizer("adam", weight_decay = 0.001), + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7), + early_stopping = 100, + validation = 0.2, + device = "cuda", + verbose = F, + plot = F + ) + + evaluate_model(nn_fit, data_test) + }, error = function(e){ + na_performance + }) + + # Summarize results + performance_summary = tibble( + species = !!species, + obs = nrow(data_train), + fold_eval = k, + model = c("RF", "GBM", "GAM", "NN"), + auc = c(rf_performance$AUC, gbm_performance$AUC, gam_performance$AUC, nn_performance$AUC), + accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, gam_performance$Accuracy, nn_performance$Accuracy), + kappa = c(rf_performance$Kappa, gbm_performance$Kappa, gam_performance$Kappa, nn_performance$Kappa), + precision = c(rf_performance$Precision, gbm_performance$Precision, gam_performance$Precision, nn_performance$Precision), + recall = c(rf_performance$Recall, gbm_performance$Recall, gam_performance$Recall, nn_performance$Recall), + f1 = c(rf_performance$F1, gbm_performance$F1, gam_performance$F1, nn_performance$F1) + ) %>% + tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") + + return(performance_summary) + }) + + # Combine and save evaluation results + performance_spec = bind_rows(performance_cv) %>% + dplyr::arrange(fold_eval, model, metric) + + save(performance_spec, file = paste0("data/r_objects/ssdm_results/", species, ".RData")) +} + + +# Combine results +files = list.files("data/r_objects/ssdm_results/", full.names = T, pattern = ".RData") +ssdm_results = lapply(files, function(f){load(f); return(performance_spec)}) %>% + bind_rows() + +save(ssdm_results, file = "data/r_objects/ssdm_results.RData") + +# ----------------------------------------------------------------------# +# Train full models #### +# ----------------------------------------------------------------------# +data_split = model_data %>% + dplyr::filter(!is.na(fold_eval)) %>% + dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>% + dplyr::group_by(species) %>% + dplyr::group_split() + +for(pa_spec in data_split){ + species = pa_spec$species[1] + print(species) + + # Create inner CV folds for model training + cv_train = blockCV::cv_spatial( + pa_spec, + column = "present", + k = 5, + progress = F, plot = F, report = F + ) + pa_spec$fold_train = cv_train$folds_ids + + # Drop geometry + pa_spec$geometry = NULL + pa_spec$geometry = NULL + + # Define caret training routine + index_train = lapply(unique(sort(pa_spec$fold_train)), function(x){ + return(which(pa_spec$fold_train != x)) + }) + + train_ctrl = caret::trainControl( + search = "random", + classProbs = TRUE, + index = index_train, + summaryFunction = caret::twoClassSummary, + savePredictions = "final", + ) + + # Define predictors + predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") + + # Fit models + try({ + rf_fit = caret::train( + x = pa_spec[, predictors], + y = pa_spec$present_fct, + method = "rf", + trControl = train_ctrl, + tuneLength = 4, + verbose = F + ) + save(rf_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_rf_fit.RData")) + }) + + try({ + gbm_fit = train( + x = pa_spec[, predictors], + y = pa_spec$present_fct, + method = "gbm", + trControl = train_ctrl, + tuneLength = 4, + verbose = F + ) + save(gbm_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gbm_fit.RData")) + }) + + try({ + gam_fit = train( + x = pa_spec[, predictors], + y = pa_spec$present_fct, + method = "gamSpline", + tuneLength = 4, + trControl = train_ctrl + ) + save(gam_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gam_fit.RData")) + }) + + try({ + formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'))) + nn_fit = dnn( + formula, + data = pa_spec, + hidden = c(200L, 200L, 200L), + loss = "binomial", + activation = c("sigmoid", "leaky_relu", "leaky_relu"), + epochs = 200L, + burnin = 100L, + lr = 0.001, + batchsize = max(nrow(pa_spec)/10, 64), + lambda = 0.0001, + dropout = 0.2, + optimizer = config_optimizer("adam", weight_decay = 0.001), + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7), + early_stopping = 100, + validation = 0.2, + device = "cuda", + verbose = F, + plot = F + ) + save(nn_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_nn_fit.RData")) + }) +} diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R deleted file mode 100644 index 661e169f88482ecc0555d822aed62c89f2d30bdb..0000000000000000000000000000000000000000 --- a/R/04_01_ssdm_modeling.R +++ /dev/null @@ -1,199 +0,0 @@ -library(furrr) -library(dplyr) -library(tidyr) -library(sf) -library(caret) -library(cito) -library(pROC) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") - -# ----------------------------------------------------------------------# -# Run training #### -# ----------------------------------------------------------------------# -data_split = model_data %>% - dplyr::group_by(species) %>% - dplyr::group_split() - -future::plan("multisession", workers = 16) - -furrr::future_walk( - .x = data_split, - .options = furrr::furrr_options( - seed = 123, - packages = c("dplyr", "tidyr", "sf", "caret", "cito", "pROC"), - scheduling = FALSE # make sure workers get constant stream of work - ), - .f = function(pa_spec){ - species = pa_spec$species[1] - - if(all(is.na(pa_spec$fold_eval))){ - warning("Too few samples") - return() - } - - # Define empty result for performance eval - na_performance = list( - AUC = NA, - Accuracy = NA, - Kappa = NA, - Precision = NA, - Recall = NA, - F1 = NA - ) - - # Create factor presence column - pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P")) - - # Outer CV loop (for averaging performance metrics) - performance_cv = lapply(sort(unique(pa_spec$fold_eval)), function(k){ - data_test = dplyr::filter(pa_spec, fold_eval == k) - data_train = dplyr::filter(pa_spec, fold_eval != k) - - if(nrow(data_test) == 0 || nrow(data_train) == 0){ - warning("Too few samples") - } - - # Create inner CV folds for model training - cv_train = blockCV::cv_spatial( - data_train, - column = "present", - k = 5, - progress = F, plot = F, report = F - ) - data_train$fold_train = cv_train$folds_ids - - # Drop geometry - data_train$geometry = NULL - data_test$geometry = NULL - - # Define caret training routine ##### - index_train = lapply(unique(sort(data_train$fold_train)), function(x){ - return(which(data_train$fold_train != x)) - }) - - train_ctrl = caret::trainControl( - search = "random", - classProbs = TRUE, - index = index_train, - summaryFunction = caret::twoClassSummary, - savePredictions = "final", - ) - - # Define predictors - predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") - - # Random Forest ##### - rf_performance = tryCatch({ - # Fit model - rf_fit = caret::train( - x = data_train[, predictors], - y = data_train$present_fct, - method = "rf", - trControl = train_ctrl, - tuneLength = 4, - verbose = F - ) - - evaluate_model(rf_fit, data_test) - }, error = function(e){ - na_performance - }) - - # Gradient Boosted Machine #### - gbm_performance = tryCatch({ - gbm_fit = train( - x = data_train[, predictors], - y = data_train$present_fct, - method = "gbm", - trControl = train_ctrl, - tuneLength = 4, - verbose = F - ) - evaluate_model(gbm_fit, data_test) - }, error = function(e){ - na_performance - }) - - # Generalized Additive Model #### - gam_performance = tryCatch({ - gam_fit = train( - x = data_train[, predictors], - y = data_train$present_fct, - method = "gamSpline", - tuneLength = 4, - trControl = train_ctrl - ) - evaluate_model(gam_fit, data_test) - }, error = function(e){ - na_performance - }) - - # Neural Network #### - nn_performance = tryCatch({ - formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'))) - - nn_fit = dnn( - formula, - data = data_train, - hidden = c(100L, 100L, 100L), - loss = "binomial", - activation = c("leaky_relu", "leaky_relu", "leaky_relu"), - epochs = 500L, - burnin = 100L, - lr = 0.001, - batchsize = max(nrow(data_test)/10, 32), - lambda = 0.01, - dropout = 0.2, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7), - early_stopping = 100, - validation = 0.2, - device = "cuda", - verbose = F, - plot = F - ) - - if(nn_fit$successfull == 1){ - evaluate_model(nn_fit, data_test) - } else { - na_performance - } - }, error = function(e){ - na_performance - }) - - # Summarize results - performance_summary = tibble( - species = !!species, - obs = nrow(data_train), - fold_eval = k, - model = c("RF", "GBM", "GAM", "NN"), - auc = c(rf_performance$AUC, gbm_performance$AUC, gam_performance$AUC, nn_performance$AUC), - accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, gam_performance$Accuracy, nn_performance$Accuracy), - kappa = c(rf_performance$Kappa, gbm_performance$Kappa, gam_performance$Kappa, nn_performance$Kappa), - precision = c(rf_performance$Precision, gbm_performance$Precision, gam_performance$Precision, nn_performance$Precision), - recall = c(rf_performance$Recall, gbm_performance$Recall, gam_performance$Recall, nn_performance$Recall), - f1 = c(rf_performance$F1, gbm_performance$F1, gam_performance$F1, nn_performance$F1) - ) %>% - tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") - - return(performance_summary) - }) - - # Combine and save evaluation results - performance_spec = bind_rows(performance_cv) - save(performance_spec, file = paste0("data/r_objects/ssdm_results/", species, ".RData")) - } -) - -# ----------------------------------------------------------------------# -# Combine results #### -# ----------------------------------------------------------------------# -files = list.files("data/r_objects/ssdm_results/", full.names = T) -ssdm_results = lapply(files, function(f){load(f); return(performance_spec)}) %>% - bind_rows() - -save(ssdm_results, file = "data/r_objects/ssdm_results.RData") diff --git a/R/04_02_modelling_msdm_embed.R b/R/04_02_modelling_msdm_embed.R new file mode 100644 index 0000000000000000000000000000000000000000..8257a3ae569766c104a064da5d045c62bad96b9f --- /dev/null +++ b/R/04_02_modelling_msdm_embed.R @@ -0,0 +1,112 @@ +library(dplyr) +library(tidyr) +library(cito) + +source("R/utils.R") + +load("data/r_objects/model_data.RData") + +model_data = model_data %>% + dplyr::filter(!is.na(fold_eval)) %>% + dplyr::mutate(species = as.factor(species)) %>% + sf::st_drop_geometry() + +# ----------------------------------------------------------------------# +# Train model #### +# ----------------------------------------------------------------------# +predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") +formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species, dim = 30, train = T, lambda = 0.0001)")) + +# 1. Cross validation +for(fold in 1:5){ + # Prepare data + train_data = dplyr::filter(model_data, fold_eval != fold) + + # Run model + converged = F + while(!converged){ + msdm_embed_fit = dnn( + formula, + data = train_data, + hidden = c(400L, 400L, 400L), + loss = "binomial", + activation = c("sigmoid", "leaky_relu", "leaky_relu"), + epochs = 25000, + lr = 0.01, + baseloss = 1, + batchsize = nrow(train_data), + dropout = 0.3, + burnin = 500, + optimizer = config_optimizer("adam", weight_decay = 0.001), + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), + early_stopping = 300, + validation = 0.2, + device = "cuda" + ) + + if(sum(!is.na(msdm_embed_fit$losses$valid_l)) > 5000){ + converged = T + } + } + + save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold,".RData")) +} + +# Full model +msdm_embed_fit = dnn( + formula, + data = model_data, + hidden = c(400L, 400L, 400L), + loss = "binomial", + activation = c("sigmoid", "leaky_relu", "leaky_relu"), + epochs = 25000, + lr = 0.01, + baseloss = 1, + batchsize = nrow(model_data), + dropout = 0.3, + burnin = 500, + optimizer = config_optimizer("adam", weight_decay = 0.001), + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), + early_stopping = 300, + validation = 0.2, + device = "cuda" +) + +save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData")) + +# ----------------------------------------------------------------------# +# Evaluate model #### +# ----------------------------------------------------------------------# +msdm_embed_performance = lapply(1:5, function(fold){ + load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData")) + + test_data_split = model_data %>% + dplyr::filter(fold_eval == fold) %>% + dplyr::group_split(species) + + lapply(test_data_split, function(test_data_spec){ + species = test_data_spec$species[1] + + performance = tryCatch({ + evaluate_model(msdm_embed_fit, test_data_spec) + }, error = function(e){ + list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, + Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, + TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) + }) + + performance_summary = performance %>% + as_tibble() %>% + mutate( + species = !!species, + obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), + fold_eval = !!fold, + model = "MSDM_embed", + ) %>% + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") + }) %>% + bind_rows() +}) %>% + bind_rows() + +save(msdm_embed_performance, file = paste0("data/r_objects/msdm_embed_performance.RData")) diff --git a/R/04_02_msdm_multiclass.R b/R/04_02_msdm_multiclass.R deleted file mode 100644 index b2af99b100ec74d44ee1d24d69d47b8254bab1e0..0000000000000000000000000000000000000000 --- a/R/04_02_msdm_multiclass.R +++ /dev/null @@ -1,68 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -train_data = dplyr::filter(model_data, present == 1, train == 1) # Use only presences for training -test_data = dplyr::filter(model_data, train == 0) # Evaluate on presences + pseudo-absences for comparability with binary models - -predictors = paste0("layer_", 1:19) - -# ----------------------------------------------------------------------# -# Train model #### -# ----------------------------------------------------------------------# -formula = as.formula(paste0("species ~ ", paste(predictors, collapse = '+'))) -msdm_fit_multiclass = dnn( - formula, - data = train_data, - hidden = c(200L, 200L, 200L), - loss = "softmax", - activation = c("leaky_relu", "leaky_relu", "leaky_relu"), - epochs = 5000L, - lr = 0.01, - batchsize = nrow(train_data)/5, - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda" -) - -save(msdm_fit_multiclass, file = "data/r_objects/msdm_fit_multiclass.RData") - -# ----------------------------------------------------------------------# -# Evaluate model #### -# ----------------------------------------------------------------------# -data_split = test_data %>% - group_by(species) %>% - group_split() - -msdm_results_multiclass = lapply(data_split, function(data_spec){ - msdm_performance = tryCatch({ - evaluate_multiclass_model(msdm_fit_multiclass, data_spec, k = 10) # Top-k accuracy - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = data_spec$species[1], - obs = nrow(dplyr::filter(model_data, species == data_spec$species[1])), - model = "MSDM_multiclass", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_multiclass, file = "data/r_objects/msdm_results_multiclass.RData") diff --git a/R/04_03_msdm_embed_raw.R b/R/04_03_msdm_embed_raw.R deleted file mode 100644 index 0775329909439d7f3f6292fe38e02a901b3f0241..0000000000000000000000000000000000000000 --- a/R/04_03_msdm_embed_raw.R +++ /dev/null @@ -1,81 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data_pa_sampling.RData") - -model_data = model_data %>% - dplyr::mutate(species_int = as.integer(as.factor(model_data$species))) %>% - sf::st_drop_geometry() - -fold = 1 - -test_data = dplyr::filter(model_data, fold_eval == fold) -train_data = dplyr::filter(model_data, fold_eval != fold) - -# ----------------------------------------------------------------------# -# Train model #### -# ----------------------------------------------------------------------# -predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 50, train = T, lambda = 0.0001)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_raw = dnn( - formula, - data = train_data, - hidden = c(250L, 250L, 250L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 10L, - lr = 0.1, - baseloss = 1, - batchsize = 4096, - dropout = 0.1, - burnin = 500, - lambda = 0.0001, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 150, factor = 0.7), - early_stopping = 400, - validation = 0.2, - device = "cuda" -) - -save(msdm_fit_embedding_raw, file = paste0("data/r_objects/msdm_raw_results/msdm_raw_fold", fold,".RData")) - -# ----------------------------------------------------------------------# -# Evaluate model #### -# ----------------------------------------------------------------------# -load("data/r_objects/msdm_fit_embedding_raw.RData") - -data_split = test_data %>% - split(test_data$species) - -msdm_results_embedding_raw = lapply(data_split, function(pa_spec){ - species = pa_spec$species[1] - pa_spec = pa_spec %>% - dplyr::select(-species, -fold_eval) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_raw, pa_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!species, - obs = nrow(dplyr::filter(train_data, species == !!species)), - fold_eval = !!fold, - model = "MSDM_embed_raw", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) %>% - tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") -}) %>% bind_rows() - -save(msdm_results_embedding_raw, file = paste0("data/r_objects/msdm_raw_results/result_msdm_raw_fold", fold,".RData")) diff --git a/R/04_04_msdm_embed_traits.R b/R/04_04_msdm_embed_traits.R deleted file mode 100644 index fce4d556361aeb8f157d6fb088a97327d433eb85..0000000000000000000000000000000000000000 --- a/R/04_04_msdm_embed_traits.R +++ /dev/null @@ -1,143 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") -load("data/r_objects/func_dist.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -model_species = intersect(model_data$species, names(func_dist)) - -model_data_final = model_data %>% - dplyr::filter(species %in% !!model_species) %>% - # dplyr::mutate_at(vars(starts_with("layer")), ~as.vector(scale(.))) %>% # Scaling seems to make things worse often - dplyr::mutate(species_int = as.integer(as.factor(species))) - -train_data = dplyr::filter(model_data_final, train == 1) -test_data = dplyr::filter(model_data_final, train == 0) - -sp_ind = match(model_species, names(func_dist)) -func_dist = as.matrix(func_dist)[sp_ind, sp_ind] - -embeddings = eigen(func_dist)$vectors[,1:20] -predictors = paste0("layer_", 1:19) - -# ----------------------------------------------------------------------# -# Without training the embedding #### -# ----------------------------------------------------------------------# -# 1. Train -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_traits_static = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_traits_static, file = "data/r_objects/msdm_fit_embedding_traits_static.RData") - -# 2. Evaluate -# Per species -load("data/r_objects/msdm_fit_embedding_traits_static.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_traits_static = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_traits_static, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_traits_static", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_traits_static, file = "data/r_objects/msdm_results_embedding_traits_static.RData") - -# -------------------------------------------------------------------# -# With training the embedding #### -# ------------------------------------------------------------ ------# -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_traits_trained = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_traits_trained, file = "data/r_objects/msdm_fit_embedding_traits_trained.RData") - -# 2. Evaluate -load("data/r_objects/msdm_fit_embedding_traits_trained.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_traits_trained = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_traits_trained, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_traits_trained", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_traits_trained, file = "data/r_objects/msdm_results_embedding_traits_trained.RData") diff --git a/R/04_05_msdm_embed_phylo.R b/R/04_05_msdm_embed_phylo.R deleted file mode 100644 index 11134f38f5859acd27bc4b2034b01f3db61530ce..0000000000000000000000000000000000000000 --- a/R/04_05_msdm_embed_phylo.R +++ /dev/null @@ -1,143 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") -load("data/r_objects/phylo_dist.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -model_species = intersect(model_data$species, colnames(phylo_dist)) - -model_data_final = model_data %>% - dplyr::filter(species %in% !!model_species) %>% - # dplyr::mutate_at(vars(starts_with("layer")), scale) %>% # Scaling seems to make things worse often - dplyr::mutate(species_int = as.integer(as.factor(species))) - -train_data = dplyr::filter(model_data_final, train == 1) -test_data = dplyr::filter(model_data_final, train == 0) - -sp_ind = match(model_species, colnames(phylo_dist)) -phylo_dist = phylo_dist[sp_ind, sp_ind] - -embeddings = eigen(phylo_dist)$vectors[,1:20] -predictors = paste0("layer_", 1:19) - -# ----------------------------------------------------------------------# -# Without training the embedding #### -# ----------------------------------------------------------------------# -# 1. Train -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_phylo_static = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_phylo_static, file = "data/r_objects/msdm_fit_embedding_phylo_static.RData") - -# 2. Evaluate -# Per species -load("data/r_objects/msdm_fit_embedding_phylo_static.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_phylo_static = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_phylo_static, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_phylo_static", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_phylo_static, file = "data/r_objects/msdm_results_embedding_phylo_static.RData") - -# -------------------------------------------------------------------# -# With training the embedding #### -# ------------------------------------------------------------ ------# -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_phylo_trained = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_phylo_trained, file = "data/r_objects/msdm_fit_embedding_phylo_trained.RData") - -# 2. Evaluate -load("data/r_objects/msdm_fit_embedding_phylo_trained.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_phylo_trained = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_phylo_trained, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_phylo_trained", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_phylo_trained, file = "data/r_objects/msdm_results_embedding_phylo_trained.RData") diff --git a/R/04_06_msdm_embed_ranges.R b/R/04_06_msdm_embed_ranges.R deleted file mode 100644 index b9bd65d3b3ab247a48dac3929b8f092730bcbe66..0000000000000000000000000000000000000000 --- a/R/04_06_msdm_embed_ranges.R +++ /dev/null @@ -1,144 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") -load("data/r_objects/range_dist.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -model_species = intersect(model_data$species, colnames(range_dist)) - -model_data_final = model_data %>% - dplyr::filter(species %in% !!model_species) %>% - # dplyr::mutate_at(vars(starts_with("layer")), scale) %>% # Scaling seems to make things worse often - dplyr::mutate(species_int = as.integer(as.factor(species))) - -train_data = dplyr::filter(model_data_final, train == 1) -test_data = dplyr::filter(model_data_final, train == 0) - -sp_ind = match(model_species, colnames(range_dist)) -range_dist = range_dist[sp_ind, sp_ind] - -embeddings = eigen(range_dist)$vectors[,1:20] -mode(embeddings) = "numeric" -predictors = paste0("layer_", 1:19) - -# ----------------------------------------------------------------------# -# Without training the embedding #### -# ----------------------------------------------------------------------# -# 1. Train -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_range_static = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_range_static, file = "data/r_objects/msdm_fit_embedding_range_static.RData") - -# 2. Evaluate -# Per species -load("data/r_objects/msdm_fit_embedding_range_static.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_range_static = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_range_static, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_range_static", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_range_static, file = "data/r_objects/msdm_results_embedding_range_static.RData") - -# -------------------------------------------------------------------# -# With training the embedding #### -# ------------------------------------------------------------ ------# -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)")) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_range_trained = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 15000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_range_trained, file = "data/r_objects/msdm_fit_embedding_range_trained.RData") - -# 2. Evaluate -load("data/r_objects/msdm_fit_embedding_range_trained.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_range_trained = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_range_trained, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_range_trained", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_range_trained, file = "data/r_objects/msdm_results_embedding_range_trained.RData") diff --git a/R/04_07_msdm_embed_multi_nolonlat.R b/R/04_07_msdm_embed_multi_nolonlat.R deleted file mode 100644 index 15d37e9828461494f0458566989968d75530ee64..0000000000000000000000000000000000000000 --- a/R/04_07_msdm_embed_multi_nolonlat.R +++ /dev/null @@ -1,111 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) -library(sf) - -source("R/utils.R") - -load("data/r_objects/model_data_pa_sampling.RData") -load("data/r_objects/func_dist.RData") -load("data/r_objects/phylo_dist.RData") -load("data/r_objects/range_dist.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -model_species = Reduce( - intersect, - list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist)) -) - -model_data_final = model_data %>% - sf::st_drop_geometry() %>% - dplyr::filter(species %in% !!model_species) %>% - dplyr::mutate(species_int = as.integer(as.factor(species))) - - -test_data = dplyr::filter(model_data_final, fold_eval == 1) -train_data = dplyr::filter(model_data_final, fold_eval != 1) - -# Create embeddings -func_ind = match(model_species, colnames(func_dist)) -func_dist = func_dist[func_ind, func_ind] -func_embeddings = eigen(func_dist)$vectors[,1:20] - -phylo_ind = match(model_species, colnames(phylo_dist)) -phylo_dist = phylo_dist[phylo_ind, phylo_ind] -phylo_embeddings = eigen(phylo_dist)$vectors[,1:20] - -range_ind = match(model_species, colnames(range_dist)) -range_dist = range_dist[range_ind, range_ind] -range_embeddings = eigen(range_dist)$vectors[,1:20] - - -# ----------------------------------------------------------------------# -# Train model #### -# ----------------------------------------------------------------------# -# Define predictors -predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") - -formula = as.formula( - paste0("present ~ ", - paste(predictors, collapse = '+'), - " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = T)", - " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = T)", - " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = T)" - ) -) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_multi_nolonlat = dnn( - formula, - data = train_data, - hidden = c(200L, 200L, 200L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 30000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_multi_nolonlat, file = "data/r_objects/msdm_fit_embedding_multi_nolonlat.RData") - -# ----------------------------------------------------------------------# -# Evaluate results #### -# ----------------------------------------------------------------------# -load("data/r_objects/msdm_fit_embedding_multi_nolonlat.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_multi_nolonlat = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_multi_nolonlat, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_multi_nolonlat", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_multi_nolonlat, file = "data/r_objects/msdm_results_embedding_multi_nolonlat.RData") \ No newline at end of file diff --git a/R/04_07_msdm_oneoff.R b/R/04_07_msdm_oneoff.R deleted file mode 100644 index 4ef2a031a860a28d7f7f6596a94db2d617f85e17..0000000000000000000000000000000000000000 --- a/R/04_07_msdm_oneoff.R +++ /dev/null @@ -1,111 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) -library(sf) - -source("R/utils.R") - -load("data/r_objects/model_data_pa_sampling.RData") -load("data/r_objects/func_dist.RData") -load("data/r_objects/phylo_dist.RData") -load("data/r_objects/range_dist.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -model_species = Reduce( - intersect, - list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist)) -) - -model_data_final = model_data %>% - sf::st_drop_geometry() %>% - dplyr::filter(species %in% !!model_species) %>% - dplyr::mutate(species_int = as.integer(as.factor(species))) - - -test_data = dplyr::filter(model_data_final, fold_eval == 1) -train_data = dplyr::filter(model_data_final, fold_eval != 1) - -# Create embeddings -func_ind = match(model_species, colnames(func_dist)) -func_dist = func_dist[func_ind, func_ind] -func_embeddings = eigen(func_dist)$vectors[,1:20] - -phylo_ind = match(model_species, colnames(phylo_dist)) -phylo_dist = phylo_dist[phylo_ind, phylo_ind] -phylo_embeddings = eigen(phylo_dist)$vectors[,1:20] - -range_ind = match(model_species, colnames(range_dist)) -range_dist = range_dist[range_ind, range_ind] -range_embeddings = eigen(range_dist)$vectors[,1:20] - - -# ----------------------------------------------------------------------# -# Train model #### -# ----------------------------------------------------------------------# -# Define predictors -predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") - -formula = as.formula( - paste0("present ~ ", - paste(predictors, collapse = '+'), - " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = T)", - " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = T)", - " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = T)" - ) -) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_multi_nolonlat = dnn( - formula, - data = train_data, - hidden = c(400L, 400L, 400L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 30000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 150, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_multi_nolonlat, file = "data/r_objects/msdm_fit_embedding_multi_nolonlat.RData") - -# ----------------------------------------------------------------------# -# Evaluate results #### -# ----------------------------------------------------------------------# -load("data/r_objects/msdm_fit_embedding_multi_nolonlat.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_multi_nolonlat = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_multi_nolonlat, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_multi_nolonlat", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_multi_nolonlat, file = "data/r_objects/msdm_results_embedding_multi_nolonlat.RData") diff --git a/R/04_08_msdm_embed_multi_lonlat.R b/R/04_08_msdm_embed_multi_lonlat.R deleted file mode 100644 index 2f2e03f497811bd46d7cb1ce7e7f072abc8cfea4..0000000000000000000000000000000000000000 --- a/R/04_08_msdm_embed_multi_lonlat.R +++ /dev/null @@ -1,107 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") -load("data/r_objects/func_dist.RData") -load("data/r_objects/phylo_dist.RData") -load("data/r_objects/range_dist.RData") - -# ----------------------------------------------------------------------# -# Prepare data #### -# ----------------------------------------------------------------------# -model_species = Reduce( - intersect, - list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist)) -) - -model_data_final = model_data %>% - dplyr::filter(species %in% !!model_species) %>% - dplyr::mutate(species_int = as.integer(as.factor(species))) - -train_data = dplyr::filter(model_data_final, train == 1) -test_data = dplyr::filter(model_data_final, train == 0) - -# Create embeddings -func_ind = match(model_species, colnames(func_dist)) -func_dist = func_dist[func_ind, func_ind] -func_embeddings = eigen(func_dist)$vectors[,1:20] - -phylo_ind = match(model_species, colnames(phylo_dist)) -phylo_dist = phylo_dist[phylo_ind, phylo_ind] -phylo_embeddings = eigen(phylo_dist)$vectors[,1:20] - -range_ind = match(model_species, colnames(range_dist)) -range_dist = range_dist[range_ind, range_ind] -range_embeddings = eigen(range_dist)$vectors[,1:20] - - -# ----------------------------------------------------------------------# -# Train model #### -# ----------------------------------------------------------------------# -predictors = c("longitude", "latitude", paste0("layer_", 1:19)) - -formula = as.formula( - paste0("present ~ ", - paste(predictors, collapse = '+'), - " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = F)", - " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = F)", - " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = F)" - ) -) - -plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there -msdm_fit_embedding_multi_lonlat = dnn( - formula, - data = train_data, - hidden = c(500L, 500L, 500L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 30000L, - lr = 0.01, - baseloss = 1, - batchsize = nrow(train_data), - dropout = 0.1, - burnin = 100, - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 250, - validation = 0.3, - device = "cuda", -) -save(msdm_fit_embedding_multi_lonlat, file = "data/r_objects/msdm_fit_embedding_multi_lonlat.RData") - -# ----------------------------------------------------------------------# -# Evaluate results #### -# ----------------------------------------------------------------------# -load("data/r_objects/msdm_fit_embedding_multi_lonlat.RData") -data_split = test_data %>% - group_by(species_int) %>% - group_split() - -msdm_results_embedding_multi_lonlat = lapply(data_split, function(data_spec){ - target_species = data_spec$species[1] - data_spec = dplyr::select(data_spec, -species) - - msdm_performance = tryCatch({ - evaluate_model(msdm_fit_embedding_multi_lonlat, data_spec) - }, error = function(e){ - list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) - }) - - performance_summary = tibble( - species = !!target_species, - obs = length(which(model_data$species == target_species)), - model = "MSDM_embed_informed_multi_lonlat", - auc = msdm_performance$AUC, - accuracy = msdm_performance$Accuracy, - kappa = msdm_performance$Kappa, - precision = msdm_performance$Precision, - recall = msdm_performance$Recall, - f1 = msdm_performance$F1 - ) -}) %>% bind_rows() - -save(msdm_results_embedding_multi_lonlat, file = "data/r_objects/msdm_results_embedding_multi_lonlat.RData") \ No newline at end of file diff --git a/R/05_01_performance_analysis.qmd b/R/05_01_performance_analysis.qmd deleted file mode 100644 index 17be731c8ac03a0e4ae1cec80991f6b322a97be0..0000000000000000000000000000000000000000 --- a/R/05_01_performance_analysis.qmd +++ /dev/null @@ -1,627 +0,0 @@ ---- -title: "SDM Performance analysis" -format: html -editor: visual -engine: knitr ---- - -```{r init, echo = FALSE, include = FALSE} -library(tidyverse) -library(sf) -library(plotly) -library(DT) - -load("../data/r_objects/model_data_pa_sampling.RData") -load("../data/r_objects/ssdm_results.RData") -load("../data/r_objects/msdm_results_embedding_raw.RData") - -load("../data/r_objects/range_maps.RData") -load("../data/r_objects/range_maps_gridded.RData") -load("../data/r_objects/occs_final.RData") -load("../data/r_objects/functional_groups.RData") -sf::sf_use_s2(use_s2 = FALSE) -``` - -```{r globals, echo = FALSE, include = FALSE} - - -# Count occs per species -obs_count = model_data %>% - sf::st_drop_geometry() %>% - dplyr::filter(present == 1) %>% - dplyr::group_by(species) %>% - dplyr::summarise(obs = n()) - - -# Regression functions -asym_regression = function(x, y){ - nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) - new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) - data.frame( - x = new_x, - fit = predict(nls_fit, newdata = data.frame(x = new_x)) - ) -} - -lin_regression = function(x, y, family = "binomial"){ - glm_fit = suppressWarnings(glm(y~x, family = family)) - new_x = seq(min(x), max(x), length.out = 100) - data.frame( - x = new_x, - fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") - ) -} - -msdm_results = msdm_results_embedding_raw %>% - pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") %>% - dplyr::select(-obs) %>% - dplyr::mutate( - fold_eval = 1 - ) %>% - drop_na() - -# Performance table -performance = ssdm_results %>% - dplyr::select(-obs) %>% - dplyr::filter(fold_eval == 1, species %in% msdm_results$species) %>% # Only look at first fold - bind_rows(msdm_results) %>% - ungroup() %>% - dplyr::mutate( - value = case_when( - ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accurracy", "precision", "recall")) ~ 0.5, - ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, - .default = value - ) - ) - -focal_metrics = unique(performance$metric) -``` - -## Summary - -This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest. - -Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). - -### Modeling overview: - -#### General decisions - -- Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25) -- Balanced presences and absences for each species -- Predictors: all 19 CHELSA bioclim variables -- 70/30 Split of training vs. test data (except for NN models) - -#### sSDM Algorithms - -Random Forest (**SSDM_RF**) - -- Hyperparameter tuning of `mtry` -- Spatial block cross-validation during training - -Generalized boosted machine (**SSDM_GBM**) - -- Hyperparameter tuning across `n.trees` , `interaction.depth` , `shrinkage`, `n.minobsinnode` -- Spatial block cross-validation during training - -Generalized Linear Model (**SSDM_GLM**) - -- Logistic model with binomial link function -- Spatial block cross-validation during training - -Neural Netwok (**SSDM_NN**) - -- Three hidden layers, leaky ReLu activations, binomial loss -- no spatial block cross-validation during training - -#### mSDM Algorithms - -Binary Neural Network with species embedding (**MSDM_embed**) - -- definition: presence \~ environment + embedding(species) -- prediction: probability of occurrence given a set of (environmental) inputs and species identity -- embedding initialized at random -- three hidden layers, sigmoid + leaky ReLu activations, binomial loss - -Binary Neural Network with trait-informed species embedding (**MSDM_embed_informed_trained**) - -- definition: presence \~ environment + embedding(species) -- prediction: probability of occurrence given a set of (environmental) inputs and species identity -- embedding initialized using eigenvectors of functional distance matrix, then further training on data -- three hidden layers, sigmoid + leaky ReLu activations, binomial loss - -Multi-Class Neural Network (**MSDM_multiclass**) - -- definition: species identity \~ environment -- prediction: probability distribution across all observed species given a set of (environmental) inputs -- presence-only data in training -- three hidden layers, leaky ReLu activations, softmax loss -- Top-k based evaluation (k=10, P/A \~ target species in / not among top 10 predictions) - -### Key findings: - -- sSDM algorithms (RF, GBM) outperformed mSDMs in most cases -- mSDMs showed indications of better performance for rare species (\< 10-20 occurrences) -- More occurrence records and larger range sizes tended to improve model performance -- Higher range coverage correlated with better performance -- Range coverage bias and functional group showed some impact but were less consistent -- Convergence problems hampered NN sSDM performance - -## Analysis - -The table below shows the analysed modeling results. - -```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} -DT::datatable(performance) %>% - formatRound(columns="value", digits=3) -``` - -### Number of records - -- Model performance was generally better for species with more observations -- Very poor performance below 50-100 observations - -```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} -plot_performance_over_frequency = function(df_plot, metric) { - - df_plot = dplyr::filter(df_plot, metric == !!metric) - - # Calculate regression lines for each model and metric combination - suppressWarnings({ - regression_lines = df_plot %>% - group_by(model) %>% - group_modify( ~ asym_regression(.x$obs, .x$value)) - }) - - # Create base plot - plot <- plot_ly() %>% - layout( - title = "Model Performance vs. Number of observations", - xaxis = list(title = "Number of observations", type = "log"), - yaxis = list(title = metric), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest' - ) - - # Points - for (model_name in unique(df_plot$model)) { - plot = plot %>% - add_markers( - data = filter(df_plot, model == model_name, metric %in% focal_metrics), - x = ~ obs, - y = ~ value, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~ model, - hoverinfo = 'text', - text = ~ paste( - "Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3) - ) - ) - } - - # Add regression lines - for (model_name in unique(df_plot$model)) { - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~ x, - y = ~ fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE - ) - } - - return(plot) -} - -df_plot = performance %>% dplyr::left_join(obs_count, by = "species") - -``` - -::: panel-tabset -#### AUC - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "auc") -bslib::card(plot, full_screen = T) -``` - -#### F1 - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "f1") -bslib::card(plot, full_screen = T) -``` - -#### Cohen's kappa - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "kappa") -bslib::card(plot, full_screen = T) -``` - -#### Accurracy - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "accuracy") -bslib::card(plot, full_screen = T) -``` - -#### Precision - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "precision") -bslib::card(plot, full_screen = T) -``` - -#### Recall - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "recall") -bslib::card(plot, full_screen = T) -``` -::: - -### Range characteristics - -#### Range size - -Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). - -- Model performance tended to be slightly higher for species with larger range size -- Only RF shows continuous performance improvements beyond range sizes of \~5M km² - -```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} - -df_join = range_maps %>% - dplyr::mutate(range_size = as.numeric(st_area(range_maps) / 1000000)) %>% # range in sqkm - sf::st_drop_geometry() - -df_plot = performance %>% - inner_join(df_join, by = c("species" = "name_matched")) - -# Calculate regression lines for each model and metric combination -suppressWarnings({ - regression_lines = df_plot %>% - group_by(model, metric) %>% - group_modify(~asym_regression(.x$range_size, .x$value)) -}) - -# Create base plot -plot <- plot_ly() %>% - layout( - title = "Model Performance vs. Range size", - xaxis = list(title = "Range size"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -# Add regression lines and confidence intervals for each model -for (model_name in unique(df_plot$model)) { - plot <- plot %>% - add_markers( - data = filter(df_plot, model == model_name), - x = ~range_size, - y = ~value, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range size:", range_size, "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -for (model_name in unique(df_plot$model)) { - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE, - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -bslib::card(plot, full_screen = T) -``` - -#### Range coverage - -Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. - -$$ -RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} -$$ - -- Models for species with higher range coverage showed slightly better performance - -```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} -df_cells_total = range_maps_gridded %>% - dplyr::rename("species" = name_matched) %>% - group_by(species) %>% - summarise(cells_total = n()) %>% - st_drop_geometry() - -df_cells_occ <- range_maps_gridded %>% - st_join(occs_final, join = st_intersects) %>% - filter(name_matched == species) %>% # Filter only intersections of the same species - group_by(species) %>% - summarise(cells_occupied = n_distinct(geometry)) %>% - st_drop_geometry() - -df_join = df_cells_total %>% - dplyr::inner_join(df_cells_occ, by = "species") %>% - dplyr::mutate(range_cov = cells_occupied / cells_total) %>% - dplyr::select(species, range_cov) - -df_plot = performance %>% - inner_join(df_join, by = "species") - -# Calculate regression lines for each model and metric combination -suppressWarnings({ - regression_lines = df_plot %>% - group_by(model, metric) %>% - group_modify(~lin_regression(.x$range_cov, .x$value)) -}) - -# Create base plot -plot <- plot_ly() %>% - layout( - title = "Model Performance vs. Range coverage", - xaxis = list(title = "Range coverage"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -# Add regression lines and confidence intervals for each model -for (model_name in unique(df_plot$model)) { - plot <- plot %>% - add_markers( - data = filter(df_plot, model == model_name), - x = ~range_cov, - y = ~value, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range coverage:", range_cov, "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -for (model_name in unique(df_plot$model)) { - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE, - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -bslib::card(plot, full_screen = T) -``` - -#### Range coverage bias - -Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. - -$$ -RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)} -$$ - -Higher bias values indicate that occurrence records are spatially more clustered within the range of the species. - -- There was no strong relationship between range coverage bias and model performance - -```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} -df_occs_total = occs_final %>% - st_drop_geometry() %>% - group_by(species) %>% - summarise(occs_total = n()) - -df_join = df_occs_total %>% - dplyr::inner_join(df_cells_total, by = "species") %>% - dplyr::inner_join(df_cells_occ, by = "species") %>% - dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) - -df_plot = performance %>% - inner_join(df_join, by = "species") - -# Calculate regression lines for each model and metric combination -suppressWarnings({ - regression_lines = df_plot %>% - group_by(model, metric) %>% - group_modify(~lin_regression(.x$range_bias, .x$value)) -}) - -# Create base plot -plot <- plot_ly() %>% - layout( - title = "Model Performance vs. Range coverage bias", - xaxis = list(title = "Range coverage bias"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -# Add regression lines and confidence intervals for each model -for (model_name in unique(df_plot$model)) { - plot <- plot %>% - add_markers( - data = filter(df_plot, model == model_name), - x = ~range_bias, - y = ~value, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -for (model_name in unique(df_plot$model)) { - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE, - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - - -bslib::card(plot, full_screen = T) -``` - -### Functional group - -Functional groups were assigned based on taxonomic order. The following groupings were used: - -| Functional group | Taxomic orders | -|--------------------|----------------------------------------------------| -| large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla | -| small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha | -| arboreal | Primates, Pilosa | -| flying | Chiroptera | - -- Models for bats tended to perform slightly worse than for other groups. - -```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} -df_plot = performance %>% - dplyr::left_join(functional_groups, by = c("species" = "name_matched")) - -plot <- plot_ly( - data = df_plot, - x = ~functional_group, - y = ~value, - color = ~model, - type = 'box', - boxpoints = "all", - jitter = 1, - pointpos = 0, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] # default value - ) - ) -) - -plot <- plot %>% - layout( - title = "Model Performance vs. Functional Group", - xaxis = list(title = "Functional group"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - boxmode = "group", - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -bslib::card(plot, full_screen = T) -``` diff --git a/R/05_01_performance_analysis_carsten.qmd b/R/05_01_performance_analysis_carsten.qmd deleted file mode 100644 index 7dee2c4fabee65f3b26905f8d549661df50c107b..0000000000000000000000000000000000000000 --- a/R/05_01_performance_analysis_carsten.qmd +++ /dev/null @@ -1,190 +0,0 @@ ---- -title: "SDM Performance analysis" -format: html -editor: visual -engine: knitr ---- - -```{r init, echo = FALSE, include = FALSE} -library(tidyverse) -library(sf) -library(plotly) -library(DT) - -load("../data/r_objects/model_data_pa_sampling.RData") -load("../data/r_objects/ssdm_results.RData") -load("../data/r_objects/msdm_results_embedding_raw.RData") - -load("../data/r_objects/range_maps.RData") -load("../data/r_objects/range_maps_gridded.RData") -load("../data/r_objects/occs_final.RData") -load("../data/r_objects/functional_groups.RData") -sf::sf_use_s2(use_s2 = FALSE) -``` - -```{r globals, echo = FALSE, include = FALSE} - - -# Count occs per species -obs_count = model_data %>% - sf::st_drop_geometry() %>% - dplyr::filter(present == 1) %>% - dplyr::group_by(species) %>% - dplyr::summarise(obs = n()) - - -# Regression functions -asym_regression = function(x, y){ - nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) - new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) - data.frame( - x = new_x, - fit = predict(nls_fit, newdata = data.frame(x = new_x)) - ) -} - -lin_regression = function(x, y, family = "binomial"){ - glm_fit = suppressWarnings(glm(y~x, family = family)) - new_x = seq(min(x), max(x), length.out = 100) - data.frame( - x = new_x, - fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") - ) -} - -msdm_results = msdm_results_embedding_raw %>% - pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") %>% - dplyr::select(-obs) %>% - dplyr::mutate( - fold_eval = 1 - ) %>% - drop_na() - -# Performance table -performance = ssdm_results %>% - dplyr::select(-obs) %>% - dplyr::filter(fold_eval == 1, species %in% msdm_results$species) %>% # Only look at first fold - bind_rows(msdm_results) %>% - ungroup() %>% - dplyr::mutate( - value = case_when( - ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accurracy", "precision", "recall")) ~ 0.5, - ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, - .default = value - ) - ) - -focal_metrics = unique(performance$metric) -``` - - -## Analysis - -### Number of records - -```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} -plot_performance_over_frequency = function(df_plot, metric) { - - df_plot = dplyr::filter(df_plot, metric == !!metric) - - # Calculate regression lines for each model and metric combination - suppressWarnings({ - regression_lines = df_plot %>% - group_by(model) %>% - group_modify( ~ asym_regression(.x$obs, .x$value)) - }) - - # Create base plot - plot <- plot_ly() %>% - layout( - title = "Model Performance vs. Number of observations", - xaxis = list(title = "Number of observations", type = "log"), - yaxis = list(title = metric), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest' - ) - - # Points - for (model_name in unique(df_plot$model)) { - plot = plot %>% - add_markers( - data = filter(df_plot, model == model_name, metric %in% focal_metrics), - x = ~ obs, - y = ~ value, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~ model, - hoverinfo = 'text', - text = ~ paste( - "Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3) - ) - ) - } - - # Add regression lines - for (model_name in unique(df_plot$model)) { - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~ x, - y = ~ fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE - ) - } - - return(plot) -} - -df_plot = performance %>% dplyr::left_join(obs_count, by = "species") - -``` - -::: panel-tabset -#### AUC - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "auc") -bslib::card(plot, full_screen = T) -``` - -#### F1 - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "f1") -bslib::card(plot, full_screen = T) -``` - -#### Cohen's kappa - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "kappa") -bslib::card(plot, full_screen = T) -``` - -#### Accurracy - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "accuracy") -bslib::card(plot, full_screen = T) -``` - -#### Precision - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "precision") -bslib::card(plot, full_screen = T) -``` - -#### Recall - -```{r echo = FALSE} -plot = plot_performance_over_frequency(df_plot, metric = "recall") -bslib::card(plot, full_screen = T) -``` -::: diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd new file mode 100644 index 0000000000000000000000000000000000000000..03ed7d30b8371534f883db27ea24609efca5404f --- /dev/null +++ b/R/05_01_performance_report.qmd @@ -0,0 +1,375 @@ +--- +title: "SDM Performance analysis" +format: html +editor: visual +engine: knitr +--- + +```{r init, echo = FALSE, include = FALSE} +library(tidyverse) +library(sf) +library(plotly) +library(DT) + +load("../data/r_objects/model_data.RData") +load("../data/r_objects/ssdm_results.RData") +load("../data/r_objects/msdm_embed_performance.RData") + +load("../data/r_objects/range_maps.RData") +load("../data/r_objects/range_maps_gridded.RData") +load("../data/r_objects/occs_final.RData") +load("../data/r_objects/functional_groups.RData") + +sf::sf_use_s2(use_s2 = FALSE) +``` + +```{r globals, echo = FALSE, include = FALSE} +# Count occs per species +obs_count = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(present == 1) %>% + dplyr::group_by(species) %>% + dplyr::summarise(obs = n()) + + +# Regression functions +asym_regression = function(x, y){ + nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) + new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) + data.frame( + x = new_x, + fit = predict(nls_fit, newdata = data.frame(x = new_x)) + ) +} + +lin_regression = function(x, y, family = "binomial"){ + glm_fit = suppressWarnings(glm(y~x, family = family)) + new_x = seq(min(x), max(x), length.out = 100) + data.frame( + x = new_x, + fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") + ) +} + +# Performance table +performance = ssdm_results %>% + bind_rows(msdm_embed_performance) %>% + dplyr::group_by(species, model, metric) %>% + dplyr::summarise(value = mean(value, na.rm = F)) %>% + dplyr::mutate( + metric = stringr::str_to_lower(metric), + value = case_when( + ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, + ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, + .default = value + ) + ) + + +focal_metrics = unique(performance$metric) + +plot_performance = function(df_plot, metric, x_var, x_label, x_log = T, reg_func = lin_regression) { + df_plot = dplyr::filter(df_plot, metric == !!metric) %>% + dplyr::rename(x_var = !!x_var) + + # Calculate regression lines for each model and metric combination + suppressWarnings({ + regression_lines = df_plot %>% + group_by(model) %>% + group_modify( ~ reg_func(.x$x_var, .x$value)) + }) + + # Create base plot + plot <- plot_ly() %>% + layout( + title = paste0("Model Performance vs. ", x_label), + xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")), + yaxis = list(title = metric), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest' + ) + + # Points + for (model_name in unique(df_plot$model)) { + plot = plot %>% + add_markers( + data = filter(df_plot, model == model_name, metric %in% focal_metrics), + x = ~ x_var, + y = ~ value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~ model, + hoverinfo = 'text', + text = ~ paste( + "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(value, 3) + ) + ) + } + + # Add regression lines + for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~ x, + y = ~ fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE + ) + } + + return(plot) +} +``` + +## Summary + +This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest. + +Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). + +### Modeling overview: + +#### General decisions + +- Balanced presences and absences for each species +- Absence sampling from probability surface that balances geographical sampling bias and environmental preferences per species + - higher probability in areas that have been sampled more intensely + - lower probability in areas with environmental conditions similar to presence locations + +{width="100%" height="800"} + +- Eight predictors: + - Min Temperature of Coldest Month (bio6) + - Precipitation of Driest Quarter (bio17) + - Climate Moisture Index (cmi) + - Surface Downwelling Shortwave Flux (rsds) + - Tree canopy cover (igfc) + - Distance to freshwater (dtfw) + - Seasonal inundation(igsw) + - Topographic roughness (roughness) +- Nested cross validation + - Outer (performance evaluation): Spatially blocked 5-fold cross validation for all models + - Inner (model training): Random split (1/5) in cito, Spatially blocked 5-fold cross validation in other models + +#### sSDM Algorithms + +- Four Algorithms: Random Forest (RF), Gradient Boosting Machine (GBM), Generalized Additive Model (GAM), Neural Network (NN) + +- NN: Manual hyperparameter tuning, same settings across species + +- RF + GBM + GAM: Automated hyperparameter tuning (4 random combinations) per species + +#### mSDM Algorithms + +Multispecies Neural Network with species embedding (**MSDM_embed**) + +- prediction: probability of occurrence given a set of (environmental) inputs and species identity +- embedding initialized at random +- three hidden layers, sigmoid + leaky ReLu activations, binomial loss + +### Key findings: + +TBA + +## Analysis + +The table below shows the analysed modeling results. + +```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} +DT::datatable(performance) %>% + formatRound(columns="value", digits=3) +``` + +### Number of records + +- Model performance was generally better for species with more observations +- Very poor performance below 50-100 observations + +```{r, echo = FALSE, message=FALSE, warnings=FALSE} +df_plot = performance %>% + dplyr::left_join(obs_count, by = "species") +``` + +::: panel-tabset +#### AUC + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "auc", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### F1 + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "f1", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Cohen's kappa + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "kappa", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Accuracy + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "accuracy", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Precision + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "precision", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Recall + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "recall", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` +::: + +### Range size + +Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). + +```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} +df_join = range_maps %>% + dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>% # range in sqkm + sf::st_drop_geometry() + +df_plot = performance %>% + inner_join(df_join, by = c("species" = "name_matched")) +``` + +::: panel-tabset +#### AUC + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "auc", x_var = "range_size", x_label = "Range size") +bslib::card(plot, full_screen = T) +``` + +#### F1 + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "f1", x_var = "range_size", x_label = "Range size") +bslib::card(plot, full_screen = T) +``` + +#### Cohen's kappa + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "kappa", x_var = "range_size", x_label = "Range size", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Accuracy + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_size", x_label = "Range size") +bslib::card(plot, full_screen = T) +``` + +#### Precision + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "precision", x_var = "range_size", x_label = "Range size") +bslib::card(plot, full_screen = T) +``` + +#### Recall + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "recall", x_var = "range_size", x_label = "Range size") +bslib::card(plot, full_screen = T) +``` +::: + +### Range coverage + +Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. + +$$ +RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} +$$ + +```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE} +occs_final_unproj = occs_final %>% sf::st_transform("EPSG:4326") + +df_cells_total = range_maps_gridded %>% + dplyr::rename("species" = name_matched) %>% + group_by(species) %>% + summarise(cells_total = n()) %>% + st_drop_geometry() + +df_cells_occ <- range_maps_gridded %>% + st_join(occs_final_unproj, join = st_intersects) %>% + filter(name_matched == species) %>% # Filter only intersections of the same species + group_by(species) %>% + summarise(cells_occupied = n_distinct(geometry)) %>% + st_drop_geometry() + +df_join = df_cells_total %>% + dplyr::inner_join(df_cells_occ, by = "species") %>% + dplyr::mutate(range_cov = cells_occupied / cells_total) %>% + dplyr::select(species, range_cov) + +df_plot = performance %>% + inner_join(df_join, by = "species") +``` + +::: panel-tabset +#### AUC + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "auc", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### F1 + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "f1", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Cohen's kappa + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "kappa", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Accuracy + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Precision + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "precision", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` + +#### Recall + +```{r echo = FALSE} +plot = plot_performance(df_plot, metric = "recall", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) +bslib::card(plot, full_screen = T) +``` +::: diff --git a/R/05_02_MSDM_comparison.qmd b/R/05_02_MSDM_comparison.qmd deleted file mode 100644 index b70af796777e659e9da4f3b62acda73a5ad89c2c..0000000000000000000000000000000000000000 --- a/R/05_02_MSDM_comparison.qmd +++ /dev/null @@ -1,779 +0,0 @@ ---- -title: "mSDM comparison" -format: html -editor: visual -engine: knitr ---- - -```{r init, echo = FALSE, include = FALSE} -library(tidyverse) -library(sf) -library(plotly) -library(DT) -library(shiny) - -load("../data/r_objects/simple_cv/msdm_results_embedding_raw.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_traits_static.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_traits_trained.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_phylo_static.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_phylo_trained.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_range_static.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_range_trained.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_multi_nolonlat.RData") -load("../data/r_objects/simple_cv/msdm_results_embedding_multi_lonlat.RData") - -sf::sf_use_s2(use_s2 = FALSE) -``` - -```{r globals, echo = FALSE, include = FALSE} -# Select metrics -focal_metrics = c("auc", "f1", "accuracy") # There's a weird bug in plotly that scrambles up lines when using more than three groups - -# Prepare final dataframes -results_embedding_raw = msdm_results_embedding_raw %>% - dplyr::mutate( - embedding = "raw", - training = "trained" - ) - -results_embedding_informed = c( - "msdm_results_embedding_phylo_static", - "msdm_results_embedding_phylo_trained", - "msdm_results_embedding_range_static", - "msdm_results_embedding_range_trained", - "msdm_results_embedding_traits_static", - "msdm_results_embedding_traits_trained", - "msdm_results_embedding_multi_nolonlat", - "msdm_results_embedding_multi_lonlat" -) - -results_embedding_informed_merged = lapply(results_embedding_informed, function(df_name){ - df = get(df_name) - name_split = str_split_1(df_name, pattern = "_") - df$embedding = name_split[4] - df$training = name_split[5] - assign(df_name, df) -}) %>% bind_rows() - -results_final = results_embedding_raw %>% - bind_rows(results_embedding_informed_merged) %>% - drop_na() %>% - mutate( - model = recode( - model, - "MSDM_embed" = "M_STD", - "MSDM_embed_informed_phylo_static" = "M_PS", - "MSDM_embed_informed_phylo_trained" = "M_PT", - "MSDM_embed_informed_traits_static" = "M_TS", - "MSDM_embed_informed_traits_trained" = "M_TT", - "MSDM_embed_informed_range_static" = "M_RS", - "MSDM_embed_informed_range_trained" = "M_RT", - "MSDM_embed_informed_multi_nolonlat" = "M_MT_nolonlat", - "MSDM_embed_informed_multi_lonlat" = "M_MT_lonlat" - ), - across(all_of(focal_metrics), round, 3) - ) - -results_final_long = results_final %>% - tidyr::pivot_longer(focal_metrics, names_to = "metric", values_to = "value") - -delta_performance = results_final %>% - dplyr::select(all_of(c("species", "model", focal_metrics))) %>% - tidyr::pivot_wider(id_cols = species, names_from = model, values_from = c(auc, accuracy, f1)) %>% - dplyr::mutate( - across(starts_with("auc_M_"), ~ . - auc_M_STD), - across(starts_with("accuracy_M_"), ~ . - accuracy_M_STD), - across(starts_with("f1_M_"), ~ . - f1_M_STD) - ) %>% - dplyr::select(-auc_M_STD, -accuracy_M_STD, -f1_M_STD) %>% - pivot_longer(-species, names_to = c("metric", "model"), names_pattern = "(^[a-zA-Z1-9]+)_(.+$)", values_to = "delta") - -# Regression functions -asym_regression = function(x, y){ - nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) - new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) - data.frame( - x = new_x, - fit = predict(nls_fit, newdata = data.frame(x = new_x)) - ) -} - -lin_regression = function(x, y){ - glm_fit = suppressWarnings(glm(y~x, family = "binomial")) - new_x = seq(min(x), max(x), length.out = 100) - data.frame( - x = new_x, - fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") - ) -} -``` - -```{r create_summaries, echo = FALSE} -auc_overview = results_final %>% - pivot_wider(names_from = model, values_from = auc, id_cols = c(species, obs)) %>% - dplyr::arrange(obs) - -accuracy_overview = results_final %>% - pivot_wider(names_from = model, values_from = accuracy, id_cols = c(species, obs)) %>% - dplyr::arrange(obs) - -f1_overview = results_final %>% - pivot_wider(names_from = model, values_from = f1, id_cols = c(species, obs)) %>% - dplyr::arrange(obs) -``` - -## *Model overview* - -### *Performance summary* - -```{r echo = FALSE, message=FALSE, warnings=FALSE} -p_summary = results_final_long %>% - group_by(model, metric) %>% - dplyr::summarize(value = round(mean(value, na.rm = T), 3)) %>% - pivot_wider(names_from = model, values_from = value) %>% - select(metric, M_STD, M_TS, M_PS, M_RS, M_TT, M_PT, M_RT) - -datatable(p_summary) -``` - -Exact results for different performance metrics across all models are shown below. - -::: panel-tabset -### *AUC* - -```{r echo = FALSE} -datatable( - auc_overview, - options = list( - pageLength = 10, - initComplete = htmlwidgets::JS( - "function(settings, json) { - $(this.api().table().container()).css({'font-size': '10pt'}); - }" - ), - autoWidth = TRUE, - columnDefs = list(list(width = "250px", targets = 1)) - ) -) -``` - -### *Accuracy* - -```{r echo = FALSE} -datatable( - accuracy_overview, - options = list( - pageLength = 10, - initComplete = htmlwidgets::JS( - "function(settings, json) { - $(this.api().table().container()).css({'font-size': '10pt'}); - }" - ), - autoWidth = TRUE, - columnDefs = list(list(width = "250px", targets = 1)) - ) -) -``` - -### *F1 score* - -```{r echo = FALSE} -datatable( - f1_overview, - options = list( - pageLength = 10, - initComplete = htmlwidgets::JS( - "function(settings, json) { - $(this.api().table().container()).css({'font-size': '10pt'}); - }" - ), - autoWidth = TRUE, - columnDefs = list(list(width = "250px", targets = 1)) - ) -) -``` -::: - -## *Number of records* - -```{r performance_vs_occurrences, echo = FALSE, message=FALSE, warnings=FALSE} -# Dropdown options -plotly_buttons = list() -for(metric in focal_metrics){ - plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", metric), label = metric) -} - -df_plot = results_final_long - -# Calculate regression lines for each model and metric combination -suppressWarnings({ - regression_lines = df_plot %>% - group_by(model, metric) %>% - group_modify(~asym_regression(.x$obs, .x$value)) -}) - -# Create base plot -plot <- plot_ly() %>% - layout( - title = "Model Performance vs. Number of observations", - xaxis = list(title = "Number of observations", type = "log"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -# Points -for (model_name in unique(df_plot$model)) { - plot = plot %>% - add_markers( - data = filter(df_plot, model == model_name), - x = ~obs, - y = ~value, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - opacity = 0.6, - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -# Add regression lines -for(model_name in unique(df_plot$model)){ - reg_data = dplyr::filter(regression_lines, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~fit, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(fit)'), - showlegend = FALSE, - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = focal_metrics[1] - ) - ) - ) -} - -bslib::card(plot, full_screen = T) -``` - -## *Relative Performance* - -```{r delta, echo = FALSE, message=FALSE, warnings=FALSE} -results_ranked_obs = results_final_long %>% - group_by(species, metric) %>% - mutate(rank = rank(value)) - -reglines = results_ranked_obs %>% - group_by(model, metric) %>% - group_modify(~as.data.frame(loess.smooth(.x$obs, .x$rank))) - - -# The table below summarizes the relative performance of the models across different observation frequency ranges. The `rank` column indicates the model's performance rank compared to all other models for a given combination of model and metric. The subsequent columns, `(1,10]`, `(10,25]`, ..., `(5000, Inf]`, represent bins of observation frequency. The values in these columns show how many times the model's performance was ranked at the specified `rank` within the respective frequency range. - -# freq_thresholds = c(1, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, Inf) -# -# df_print = performance %>% -# mutate(freq_class = cut(obs, freq_thresholds, dig.lab = 5)) %>% -# group_by(species, metric, freq_class) %>% -# dplyr::mutate(rank = order(value, decreasing = T)) %>% -# group_by(model, metric, rank, freq_class) %>% -# tally() %>% -# pivot_wider(names_from = freq_class, values_from = n) %>% -# dplyr::select("model", "metric", "rank", "(1,10]", "(10,25]", -# "(25,50]", "(50,100]", "(100,250]", "(250,500]", -# "(500,1000]", "(1000,2500]", "(2500,5000]", "(5000,Inf]") %>% -# dplyr::arrange(metric, rank, model) %>% -# replace(is.na(.), 0) -# -# DT::datatable(df_print) -``` - -::: panel-tabset -### *AUC* - -```{r echo = FALSE} -df_plot = dplyr::filter(results_ranked_obs, metric == "auc") -reglines_plot = dplyr::filter(reglines, metric == "auc") - -plot = plot_ly( - data = df_plot, - x = ~obs, - y = ~rank, - color = ~model, - type = 'scatter', - mode = 'markers', - opacity = 0.5 -) %>% - layout( - yaxis = list(title = "rank (lower is better)"), - xaxis = list(title = "number of observations", type = "log"), # log scale for x-axis - showlegend = TRUE - ) - - -for(model_name in unique(df_plot$model)){ - reg_data = dplyr::filter(reglines_plot, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~y, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(smooth)'), - showlegend = FALSE, - ) -} - -bslib::card(plot, full_screen = T) -``` - -### *Accuracy* - -```{r echo = FALSE} -df_plot = dplyr::filter(results_ranked_obs, metric == "accuracy") -reglines_plot = dplyr::filter(reglines, metric == "accuracy") - -plot = plot_ly( - data = df_plot, - x = ~obs, - y = ~rank, - color = ~model, - type = 'scatter', - mode = 'markers', - opacity = 0.5 -) %>% - layout( - yaxis = list(title = "rank (lower is better)"), - xaxis = list(title = "number of observations", type = "log"), # log scale for x-axis - showlegend = TRUE - ) - - -for(model_name in unique(df_plot$model)){ - reg_data = dplyr::filter(reglines_plot, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~y, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(smooth)'), - showlegend = FALSE, - ) -} - -bslib::card(plot, full_screen = T) -``` - -### *F1 score* - -```{r echo = FALSE} -df_plot = dplyr::filter(results_ranked_obs, metric == "f1") -reglines_plot = dplyr::filter(reglines, metric == "f1") - -plot = plot_ly( - data = df_plot, - x = ~obs, - y = ~rank, - color = ~model, - type = 'scatter', - mode = 'markers', - opacity = 0.5 -) %>% - layout( - yaxis = list(title = "rank (lower is better)"), - xaxis = list(title = "number of observations", type = "log"), # log scale for x-axis - showlegend = TRUE - ) - - -for(model_name in unique(df_plot$model)){ - reg_data = dplyr::filter(reglines_plot, model == model_name) - plot = plot %>% - add_lines( - data = reg_data, - x = ~x, - y = ~y, - color = model_name, # Set color to match legendgroup - legendgroup = model_name, - name = paste(model_name, '(smooth)'), - showlegend = FALSE, - ) -} - -bslib::card(plot, full_screen = T) -``` -::: - -## *Trait space* - -```{r trait_pca, echo = FALSE, message=FALSE, warnings=FALSE} -load("../data/r_objects/traits_proc.RData") - -# Preprocess traits for PCA -traits_num = traits_proc %>% - ungroup() %>% - dplyr::mutate(temp_id = row_number()) %>% - pivot_wider( - names_from = ForStrat.Value, - values_from = temp_id, - values_fn = function(x) 1, - values_fill = 0, - names_prefix = "ForStrat." - ) %>% - drop_na(species) %>% - column_to_rownames(var = "species") %>% - scale() - -# Run PCA -traits_pca = prcomp(traits_num) -species_vcts = traits_pca$x %>% - as.data.frame() %>% - rownames_to_column(var = "species") - -df_performance_vs_traits = dplyr::inner_join(delta_performance, species_vcts) -``` - -Functional traits for `r I(nrow(traits_proc))` species were used to construct a multidimensional trait space. Before ordination, categorical variables were converted into dummy variables, and all variables were scaled and centered. A Principal Component Analysis (PCA) was performed on the preprocessed dataset, and the first three axes are used to visualize species' positions within the trait space. For reference, the rotation vectors of the original traits are also plotted. - -::: panel-tabset -#### *AUC* - -```{r auc_trait_space, echo = FALSE, message=FALSE, warnings=FALSE} -# Create plot df -df_plot = dplyr::filter(df_performance_vs_traits, metric == "auc", !is.na(delta)) %>% - group_by(species) %>% - mutate( - PC1 = PC1 + runif(1, -0.15, 0.15), # add jitter - PC2 = PC2 + runif(1, -0.15, 0.15) # add jitter - ) - -# Buttons -plotly_buttons = list() -for(model in c("M_PS", "M_PT", "M_TS", "M_TT", "M_RS", "M_RT")){ - plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", model), label = model) -} - -# Create plot -plot <- plot_ly( - marker = list( - color = ~delta, - colors = colorRamp(c("blue", "lightgrey", "red")), - cauto = F, - cmin = -1, - cmid = 0, - cmax = 1, - opacity = 0.6, - colorbar = list( - title = "\u0394 AUC", - titleside = "right", - x = 1.1, # Adjust x position of colorbar - y = 0.5 # Adjust y position of colorbar - ) - )) %>% - add_markers( - data = df_plot, - x = ~PC1, - y = ~PC2, - z = ~PC3, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>\u0394:", round(delta, 3)), - transforms = list( - list( - type = 'filter', - target = ~model, - operation = '=', - value = "M_TS" - ) - ) - ) %>% - layout( - title = "Relative model performance (trait space)", - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -trait_loadings = traits_pca$rotation * traits_pca$sdev -for (i in 1:nrow(trait_loadings)) { - plot <- plot %>% - add_trace( - x = c(0, trait_loadings[i, 1]), - y = c(0, trait_loadings[i, 2]), - z = c(0, trait_loadings[i, 3]), - type = "scatter3d", - mode = "lines+markers+text", - line = list( - color = "black", - width = 2, - arrow = list( - end = 1, - type = "open", - length = 0.1 - ) - ), - marker = list( - color = "black", - size = 1, - symbol="arrow" - ), - text = rownames(trait_loadings)[i], - textposition = "top right", - opacity = 1, - hoverinfo = "text", - text = rownames(trait_loadings)[i], - showlegend = FALSE - ) -} - - -bslib::card(plot, full_screen = T) -``` - -#### *Accuracy* - -```{r accuracy_trait_space, echo = FALSE, message=FALSE, warnings=FALSE} -# Create plot df -df_plot = dplyr::filter(df_performance_vs_traits, metric == "accuracy", !is.na(delta)) %>% - group_by(species) %>% - mutate( # add jitter - PC1 = PC1 + runif(1, -0.15, 0.15), - PC2 = PC2 + runif(1, -0.15, 0.15), - PC3 = PC3 + runif(1, -0.15, 0.15) - ) - -# Buttons -plotly_buttons = list() -for(model in c("M_PS", "M_PT", "M_TS", "M_TT", "M_RS", "M_RT")){ - plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", model), label = model) -} - -# Create plot -plot <- plot_ly( - marker = list( - color = ~delta, - colors = colorRamp(c("blue", "lightgrey", "red")), - cauto = F, - cmin = -1, - cmid = 0, - cmax = 1, - opacity = 0.6, - colorbar = list( - title = "\u0394 Accuracy", - titleside = "right", - x = 1.1, # Adjust x position of colorbar - y = 0.5 # Adjust y position of colorbar - ) - )) %>% - add_markers( - data = df_plot, - x = ~PC1, - y = ~PC2, - z = ~PC3, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>\u0394:", round(delta, 3)), - transforms = list( - list( - type = 'filter', - target = ~model, - operation = '=', - value = "M_TS" - ) - ) - ) %>% - layout( - title = "Relative model performance (trait space)", - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -trait_loadings = traits_pca$rotation * traits_pca$sdev -for (i in 1:nrow(trait_loadings)) { - plot <- plot %>% - add_trace( - x = c(0, trait_loadings[i, 1]), - y = c(0, trait_loadings[i, 2]), - z = c(0, trait_loadings[i, 3]), - type = "scatter3d", - mode = "lines+markers+text", - line = list( - color = "black", - width = 2, - arrow = list( - end = 1, - type = "open", - length = 0.1 - ) - ), - marker = list( - color = "black", - size = 1, - symbol="arrow" - ), - text = rownames(trait_loadings)[i], - textposition = "top right", - opacity = 1, - hoverinfo = "text", - text = rownames(trait_loadings)[i], - showlegend = FALSE - ) -} - - -bslib::card(plot, full_screen = T) -``` - -#### *F1* - -```{r f1_trait_space, echo = FALSE, message=FALSE, warnings=FALSE} -# Create plot df -df_plot = dplyr::filter(df_performance_vs_traits, metric == "f1", !is.na(delta)) %>% - group_by(species) %>% - mutate( - PC1 = PC1 + runif(1, -0.15, 0.15), # add jitter - PC2 = PC2 + runif(1, -0.15, 0.15) # add jitter - ) - -# Buttons -plotly_buttons = list() -for(model in c("M_PS", "M_PT", "M_TS", "M_TT", "M_RS", "M_RT")){ - plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", model), label = model) -} - -# Create plot -plot <- plot_ly( - marker = list( - color = ~delta, - colors = colorRamp(c("blue", "lightgrey", "red")), - cauto = F, - cmin = -1, - cmid = 0, - cmax = 1, - opacity = 0.6, - colorbar = list( - title = "\u0394 F1", - titleside = "right", - x = 1.1, # Adjust x position of colorbar - y = 0.5 # Adjust y position of colorbar - ) - )) %>% - add_markers( - data = df_plot, - x = ~PC1, - y = ~PC2, - z = ~PC3, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>\u0394:", round(delta, 3)), - transforms = list( - list( - type = 'filter', - target = ~model, - operation = '=', - value = "M_TS" - ) - ) - ) %>% - layout( - title = "Relative model performance (trait space)", - legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot - margin = list(r = 150), # Add right margin to accommodate legend - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = plotly_buttons - ) - ) - ) - -trait_loadings = traits_pca$rotation * traits_pca$sdev -for (i in 1:nrow(trait_loadings)) { - plot <- plot %>% - add_trace( - x = c(0, trait_loadings[i, 1]), - y = c(0, trait_loadings[i, 2]), - z = c(0, trait_loadings[i, 3]), - type = "scatter3d", - mode = "lines+markers+text", - line = list( - color = "black", - width = 2, - arrow = list( - end = 1, - type = "open", - length = 0.1 - ) - ), - marker = list( - color = "black", - size = 1, - symbol="arrow" - ), - text = rownames(trait_loadings)[i], - textposition = "top right", - opacity = 1, - hoverinfo = "text", - text = rownames(trait_loadings)[i], - showlegend = FALSE - ) -} - -bslib::card(plot, full_screen = T) -``` -::: - -### *Taxonomy* - -```{r taxonomy, echo = FALSE, message=FALSE, warnings=FALSE} -# load("../data/r_objects/functional_groups.RData") -# -# df_plot = results_final %>% -# dplyr::left_join(functional_groups, by = c("species" = "name_matched")) -# -# plot = -# -# bslib::card(plot, full_screen = T) - -``` diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R new file mode 100644 index 0000000000000000000000000000000000000000..babf048c184351428ee8b10fb1652c86a68b133c --- /dev/null +++ b/R/05_02_publication_analysis.R @@ -0,0 +1,382 @@ +# General packages +library(tidyverse) +library(patchwork) + +# Geo packages +library(terra) +library(sf) +library(geos) + +# Stats packages +library(Rtsne) +library(cito) + +source("R/utils.R") + +load("data/r_objects/model_data.RData") +load("data/r_objects/ssdm_results.RData") +load("data/r_objects/msdm_embed_performance.RData") +load("data/r_objects/functional_groups.RData") + +load("data/r_objects/sa_polygon.RData") +load("data/r_objects/range_maps.RData") + +sf::sf_use_s2(use_s2 = FALSE) + +model_data = model_data %>% + dplyr::filter(!is.na(fold_eval)) %>% + dplyr::mutate(species = as.factor(species)) + +# ------------------------------------------------------------------ # +# 1. Collect performance results #### +# ------------------------------------------------------------------ # +performance = ssdm_results %>% + bind_rows(msdm_embed_performance) %>% + dplyr::group_by(species, model, metric) %>% + dplyr::summarise(value = mean(value, na.rm = F)) %>% + dplyr::mutate( + metric = stringr::str_to_lower(metric), + value = case_when( + ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, + ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, + .default = value + ) + ) + +# ------------------------------------------------------------------ # +# 2. Model comparison #### +# ------------------------------------------------------------------ # +## Overall model Comparison #### +df_plot = performance %>% + dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) + +ggplot(df_plot, aes(x = model, y = value, color = model)) + + geom_boxplot(alpha = 0.5, outlier.shape = 16) + + facet_wrap(~ metric, scales = "free_y") + + labs( + title = "Model Performance Across Metrics", + x = "Model", + y = "Value", + color = "Model", + ) + + theme_minimal(base_size = 14) + + theme( + strip.text = element_text(face = "bold"), + axis.text.x = element_text(angle = 45, hjust = 1), + legend.position = "right" + ) + +ggsave("plots/publication/model_performance_across_metrics.pdf", + device = "pdf", + scale = 2, + width = 18, + height = 14, + units = "cm") + +## Performance vs number of records #### +obs_count = model_data %>% + sf::st_drop_geometry() %>% + dplyr::filter(present == 1, !is.na(fold_eval)) %>% + dplyr::group_by(species) %>% + dplyr::summarise(obs = n()) + +df_plot = performance %>% + dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) %>% + dplyr::left_join(obs_count, by = "species") + +ggplot(df_plot, aes(x = obs, y = value, color = model, fill = model)) + + geom_point(alpha = 0.2) + + geom_smooth() + + facet_wrap(~ metric, scales = "free_y") + + scale_x_continuous(trans = "log10") + + labs( + title = "Model Performance vs Number of Records", + x = "Number of Records (log scale)", + y = "Value", + color = "Model", + fill = "Model" + ) + + theme_minimal(base_size = 14) + + theme( + strip.text = element_text(face = "bold"), + axis.text.x = element_text(angle = 45, hjust = 1), + legend.position = "right" + ) + +ggsave("plots/publication/model_performance_vs_number_of_records.pdf", + device = "pdf", + scale = 2, + width = 18, + height = 14, + units = "cm") + +## Performance vs functional group #### +load("data/r_objects/functional_groups.RData") + +df_plot = performance %>% + dplyr::filter(metric %in% c("auc", "f1", "kappa")) %>% + dplyr::left_join(functional_groups, by = c("species" = "name_matched")) + +ggplot(df_plot, aes(x = model, y = value, color = model)) + + geom_boxplot(alpha = 0.5, outlier.shape = 16) + + facet_grid(rows = vars(metric), cols = vars(functional_group), scales = "free_y", switch = "both") + + labs( + title = "Model Performance Across Functional Groups", + x = "Model", + y = "Value", + color = "Model", + ) + + theme_minimal(base_size = 14) + + theme( + strip.text = element_text(face = "bold"), + panel.border = element_rect(color = "gray", fill = NA), + axis.text.x = element_text(angle = 45, hjust = 1), + legend.position = "right" + ) + +ggsave("plots/publication/model_performance_vs_functional_groups.pdf", + device = "pdf", + scale = 2, + width = 18, + height = 14, + units = "cm") + +# ------------------------------------------------------------------ # +# 3. Range predictions #### +# ------------------------------------------------------------------ # +# Define plotting function +plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "nn", "msdm")){ + # Species data + load("data/r_objects/range_maps.RData") + + pa_spec = model_data %>% + dplyr::filter(species == !!spec) + range_spec = range_maps %>% + dplyr::filter(name_matched == !!spec) %>% + sf::st_transform(sf::st_crs(pa_spec)) + + # Extract raster values into df + bbox_spec = sf::st_bbox(pa_spec) %>% + expand_bbox(expansion = 0.25) + + raster_crop = terra::crop(raster_data, bbox_spec) + new_data = raster_crop %>% + terra::values(dataframe = T, na.rm = T) %>% + dplyr::mutate(species = unique(pa_spec$species)) + + # Make predictions + for(algorithm in algorithms){ + message(algorithm) + # Load model + tryCatch({ + if(algorithm == "msdm"){ + load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData") + probabilities = predict(msdm_embed_fit, new_data, type = "response")[,1] + predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P")) + } else if(algorithm == "nn") { + load(paste0("data/r_objects/ssdm_results/models/", spec, "_nn_fit.RData")) + probabilities = predict(nn_fit, new_data, type = "response")[,1] + predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P")) + } else { + load(paste0("data/r_objects/ssdm_results/models/", spec, "_", algorithm, "_fit.RData")) + predictions = predict(get(paste0(algorithm, "_fit")), new_data, type = "raw") + } + }, error = function(e){ + warning(toupper(algorithm), ": Model could not be loaded.") + }) + + raster_pred = terra::rast(raster_crop, nlyrs = 1) + values(raster_pred)[as.integer(rownames(new_data))] <- predictions + + # Plot + plot(raster_pred, + type = "classes", + levels = c("Absent", "Present"), + main = paste0("Range prediction (", toupper(algorithm), "): ", spec)) + point_colors = sapply(pa_spec$present, function(x) ifelse(x == 0, "black", "white")) + plot(pa_spec[,"present"], col = point_colors, add = T, pch = 16) + plot(range_spec, border = "white", lwd = 2, col = NA, add = T) + } +} + +# Load raster +raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) +raster_data = terra::rast(raster_filepaths) %>% + terra::crop(sf::st_bbox(sa_polygon)) %>% + terra::project(sf::st_crs(model_data)$input) + +spec = "Thyroptera tricolor" +pdf(file = paste0("plots/range_predictions/", spec, ".pdf")) +plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm")) +dev.off() + +# ------------------------------------------------------------------ # +# 5. Compare msdm predictions #### +# ------------------------------------------------------------------ # +# Check predictions for different species +load("data/r_objects/msdm_embed_results/msdm_embed_fit.RData") +specs = sample(unique(model_data$species), size = 2, replace = F) + +new_data = spatSample(raster_data, 1000, replace = F, as.df = T) %>% + drop_na() + +new_data1 = dplyr::mutate(new_data, species = specs[1]) +new_data2 = dplyr::mutate(new_data, species = specs[2]) + +pred1 = predict(msdm_embed_fit, new_data1, type = "response") +pred2 = predict(msdm_embed_fit, new_data2, type = "response") + +pred1 == pred2 + +# ---> identical predictions + +# Permute embeddings +embeddings = coef(msdm_embed_fit)[[1]][[1]] +msdm_embed_fit$net$e_1$parameters$weight$set_data(torch::torch_tensor(embeddings[sample.int(nrow(embeddings)), ], device = msdm_embed_fit$net$e_1$parameters$weight$device, dtype = msdm_embed_fit$net$e_1$parameters$weight$dtype )) + +# ---> Invalid external pointer + +# ------------------------------------------------------------------ # +# 6. Compare species embeddings #### +# ------------------------------------------------------------------ # +all_embeddings = lapply(1:5, function(fold){ + load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData")) + coef(msdm_embed_fit)[[1]][[1]] +}) + +pairs = combn(1:5, 2, simplify = F) +pairwise_rv = lapply(pairs, function(pair){ + FactoMineR::coeffRV(all_embeddings[[pair[1]]], all_embeddings[[pair[2]]]) +}) + +rv_matrix = matrix(NA, 5, 5) +for(i in seq_along(pairs)){ + rv_matrix[pairs[[i]][2], pairs[[i]][1]] = pairwise_rv[[i]][["rv"]] +} + +p_matrix = matrix(NA, 5, 5) +for(i in seq_along(pairs)){ + p_matrix[pairs[[i]][2], pairs[[i]][1]] = pairwise_rv[[i]][["p.value"]] +} + +# ------------------------------------------------------------------ # +# 7. Analyse species embeddings #### +# ------------------------------------------------------------------ # +embeddings = coef(msdm_fit_embedding_raw)[[1]][[1]] +species_lookup = msdm_fit_embedding_raw$data$data[,c("species", "species_int")] %>% + distinct() %>% + mutate(genus = stringr::str_extract(rownames(embeddings), "([a-zA-Z]+) (.*)", group = 1)) + +rownames(embeddings) = species_lookup$species + +## Dimensionality reduction #### +### PCA #### +pca_result = prcomp(t(embeddings), scale. = TRUE) +var_explained = pca_result$sdev^2 / sum(pca_result$sdev^2) +plot(var_explained) + +# --> Variance explained is distributed rather evenly across dimensions +# --> Dimensionality reduction probably not useful + +coords = pca_result$rotation[,1:2] %>% + magrittr::set_colnames(c("X", "Y")) + +df_plot = species_lookup %>% + left_join(functional_groups, by = c("species" = "name_matched")) %>% + bind_cols(coords) + +ggplot(df_plot, aes(x = X, y = Y, col = functional_group, label=genus)) + + geom_point() + + geom_text(hjust=0, vjust=0) + + guides(col="none") + +# --> PCA not informative on 2 dimensions either + +### TSNE #### +tsne_result <- Rtsne(embeddings, verbose = FALSE) +coords = tsne_result$Y %>% + magrittr::set_colnames(c("X", "Y")) + +df_plot = species_lookup %>% + left_join(functional_groups, by = c("species" = "name_matched")) %>% + bind_cols(coords) + +ggplot(df_plot, aes(x = X, y = Y, col = functional_group, label=genus)) + + geom_point() + + geom_text(hjust=0, vjust=0) + + guides(col="none") + +# --> TSNE not informative on 2 dimensions either + + +### KNN #### +embeddings_dist = as.matrix(dist(embeddings)) + +k = 9 +knn_results = sapply(as.character(species_lookup$species), FUN = function(spec){ + return(sort(embeddings_dist[spec,])[2:(k+1)]) +}, USE.NAMES = T, simplify = F) + +plot_knn_ranges = function(spec){ + spec_df = model_data %>% + dplyr::filter(species == !!spec) + + p_spec_range = ggplot() + + geom_sf(data = st_as_sf(sa_polygon)) + + geom_sf(data = spec_df) + + ggtitle(paste0(spec)) + + theme_minimal() + + knn_df = model_data %>% + dplyr::filter(species %in% c(names(knn_results[[spec]]))) + + p_knn_ranges = ggplot() + + geom_sf(data = st_as_sf(sa_polygon)) + + geom_sf(data = knn_df, aes(color = species)) + + facet_wrap(facets = "species", ncol = 3) + + theme_minimal() + + guides(color="none") + + p_spec_range + p_knn_ranges +} + +plot_knn_ranges(sample(species_lookup$species, 1)) # Repeat this line to plot random species + +# Clustering +embeddings_dist = dist(embeddings, method = "euclidean") +hclust_complete = hclust(dist_matrix, method = "complete") +plot(hclust_complete, main = "Complete Linkage", xlab = "", sub = "", labels = FALSE) + +# Phylo Correlation +load("data/r_objects/phylo_dist.RData") + +species_intersect = intersect(colnames(phylo_dist), species_lookup$species) + +phylo_dist_subset = phylo_dist[species_intersect, species_intersect] + +e_indices = match(species_intersect, species_lookup$species) +embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices] +FactoMineR::coeffRV(embeddings_dist_subset, phylo_dist_subset) + +# Trait Correlation +load("data/r_objects/func_dist.RData") + +species_intersect = intersect(colnames(func_dist), species_lookup$species) + +func_dist_subset = func_dist[species_intersect, species_intersect] + +e_indices = match(species_intersect, species_lookup$species) +embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices] +FactoMineR::coeffRV(embeddings_dist_subset, func_dist_subset) + +# Range Correlation +load("data/r_objects/range_dist.RData") + +species_intersect = intersect(colnames(range_dist), species_lookup$species) + +range_dist_subset = range_dist[species_intersect, species_intersect] + +e_indices = match(species_intersect, species_lookup$species) +embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices] +FactoMineR::coeffRV(embeddings_dist_subset, range_dist_subset) + diff --git a/R/_publish.yml b/R/_publish.yml index c07bacdf4dd08df0543df4ec56e167e482e70c0d..9d5a5d092003eaf2a866a8e9074a6dc5e49f504a 100644 --- a/R/_publish.yml +++ b/R/_publish.yml @@ -6,3 +6,7 @@ quarto-pub: - id: f9cafa63-bfd4-4c00-b401-09ec137bd3ce url: 'https://chrkoenig.quarto.pub/sdm-performance-carsten' +- source: 05_01_performance_analysis.qmd + quarto-pub: + - id: cfaa8a4d-31e3-410c-ba1a-399aad5582fd + url: 'https://chrkoenig.quarto.pub/sdm-performance-analysis-8272' diff --git a/R/utils.R b/R/utils.R index 265feed61ea02923661fa8b0225fa5c39b35e929..b433a44f336ebd9987c8ca3671057c965b610411 100644 --- a/R/utils.R +++ b/R/utils.R @@ -17,10 +17,10 @@ expand_bbox <- function(bbox, min_span = 1, expansion = 0.25) { } # Expand the limits, adjusting both directions correctly - bbox["xmin"] <- max(bbox["xmin"] - (x_expand * x_range), -180) - bbox["xmax"] <- min(bbox["xmax"] + (x_expand * x_range), 180) - bbox["ymin"] <- max(bbox["ymin"] - (y_expand * y_range), -90) - bbox["ymax"] <- min(bbox["ymax"] + (y_expand * y_range), 90) + bbox["xmin"] <- bbox["xmin"] - (x_expand * x_range) + bbox["xmax"] <- bbox["xmax"] + (x_expand * x_range) + bbox["ymin"] <- bbox["ymin"] - (y_expand * y_range) + bbox["ymax"] <- bbox["ymax"] + (y_expand * y_range) return(bbox) } @@ -40,8 +40,8 @@ evaluate_model <- function(model, data) { # Predict probabilities if(class(model) %in% c("citodnn", "citodnnBootstrap")){ - data = dplyr::select(data, any_of(all.vars(msdm_fit_embedding_raw$old_formula))) - probs = predict(model, as.matrix(data), type = "response")[,1] + data = dplyr::select(data, any_of(all.vars(model$old_formula))) + probs = predict(model, data, type = "response")[,1] preds = factor(round(probs), levels = c("0", "1"), labels = c("A", "P")) } else { probs = predict(model, data, type = "prob")$P @@ -64,7 +64,11 @@ evaluate_model <- function(model, data) { Kappa = cm$overall["Kappa"], Precision = cm$byClass["Precision"], Recall = cm$byClass["Recall"], - F1 = cm$byClass["F1"] + F1 = cm$byClass["F1"], + TP = cm$table["P", "P"], + FP = cm$table["P", "A"], + TN = cm$table["A", "A"], + FN = cm$table["A", "P"] ) ) } diff --git a/README.md b/README.md index 7ea2759dc6d18cd05ab3c2db1441537c5094ec92..aaaa50cd194bb8d2807ab511b9ca8845a4352836 100644 --- a/README.md +++ b/README.md @@ -1,93 +1,3 @@ # Symobio-modeling - - -## Getting started - -To make it easy for you to get started with GitLab, here's a list of recommended next steps. - -Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)! - -## Add your files - -- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files -- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command: - -``` -cd existing_repo -git remote add origin https://git.idiv.de/ye87zine/symobio-modeling.git -git branch -M main -git push -uf origin main -``` - -## Integrate with your tools - -- [ ] [Set up project integrations](https://git.idiv.de/ye87zine/symobio-modeling/-/settings/integrations) - -## Collaborate with your team - -- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/) -- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html) -- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically) -- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/) -- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html) - -## Test and Deploy - -Use the built-in continuous integration in GitLab. - -- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html) -- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/) -- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html) -- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/) -- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html) - -*** - -# Editing this README - -When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template. - -## Suggestions for a good README - -Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information. - -## Name -Choose a self-explaining name for your project. - -## Description -Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors. - -## Badges -On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge. - -## Visuals -Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method. - -## Installation -Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection. - -## Usage -Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README. - -## Support -Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc. - -## Roadmap -If you have ideas for releases in the future, it is a good idea to list them in the README. - -## Contributing -State if you are open to contributions and what your requirements are for accepting them. - -For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self. - -You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser. - -## Authors and acknowledgment -Show your appreciation to those who have contributed to the project. - -## License -For open source projects, say how it is licensed. - -## Project status -If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers. +Code and data accompanying SSDM vs MSDM model comparison \ No newline at end of file diff --git a/snippets.R b/snippets.R new file mode 100644 index 0000000000000000000000000000000000000000..40f55239594d109d1edeab09724070d40dab631c --- /dev/null +++ b/snippets.R @@ -0,0 +1,160 @@ +### Range coverage bias + +Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. + +$$ + RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)} + $$ + + Higher bias values indicate that occurrence records are spatially more clustered within the range of the species. + + - There was no strong relationship between range coverage bias and model performance + + ```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} + df_occs_total = occs_final %>% + st_drop_geometry() %>% + group_by(species) %>% + summarise(occs_total = n()) + + df_join = df_occs_total %>% + dplyr::inner_join(df_cells_total, by = "species") %>% + dplyr::inner_join(df_cells_occ, by = "species") %>% + dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) + + df_plot = performance %>% + inner_join(df_join, by = "species") + + # Calculate regression lines for each model and metric combination + suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_bias, .x$value)) + }) + + # Create base plot + plot <- plot_ly() %>% + layout( + title = "Model Performance vs. Range coverage bias", + xaxis = list(title = "Range coverage bias"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + + # Add regression lines and confidence intervals for each model + for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~ range_bias, + y = ~ value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) + } + + for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) + } + + + bslib::card(plot, full_screen = T) + ``` + + ### Functional group + + Functional groups were assigned based on taxonomic order. The following groupings were used: + + | Functional group | Taxomic orders | + |-------------------|-----------------------------------------------------| + | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla | + | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha | + | arboreal | Primates, Pilosa | + | flying | Chiroptera | + + - Models for bats tended to perform slightly worse than for other groups. + + ```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE, eval=F} + df_plot = performance %>% + dplyr::left_join(functional_groups, by = c("species" = "name_matched")) + + plot <- plot_ly( + data = df_plot, + x = ~functional_group, + y = ~value, + color = ~model, + type = 'box', + boxpoints = "all", + jitter = 1, + pointpos = 0, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] # default value + ) + ) + ) + + plot <- plot %>% + layout( + title = "Model Performance vs. Functional Group", + xaxis = list(title = "Functional group"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + boxmode = "group", + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + + bslib::card(plot, full_screen = T) + ``` + \ No newline at end of file