diff --git a/R/03_presence_absence_preparation.R b/R/03_presence_absence_preparation.R index fe78ceaffe66603f17bb63acc246dd5704808ea4..9fd817c71fa460cf27c87aa9d023296999392b67 100644 --- a/R/03_presence_absence_preparation.R +++ b/R/03_presence_absence_preparation.R @@ -81,7 +81,7 @@ occs_final = occs %>% save(occs_final, file = "data/r_objects/occs_final.RData") # ---------------------------------------------------------------------------# -# Create SSDM dataset #### +# Create Modeling dataset #### # ---------------------------------------------------------------------------# load("data/r_objects/occs_final.RData") load("data/r_objects/sa_polygon.RData") @@ -89,13 +89,8 @@ sf::sf_use_s2(use_s2 = FALSE) occs_split = split(occs_final, occs_final$species) -future::plan("multisession", workers = 8) -ssdm_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){ - # skip low information species - if(nrow(occs_spec) < 5){ - return(NULL) - } - +future::plan("multisession", workers = 16) +model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){ # Define model/sampling region occs_bbox = occs_spec %>% sf::st_bbox() %>% @@ -132,77 +127,34 @@ ssdm_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::fu dplyr::mutate(present = 1) %>% bind_rows(abs_spec) - # Define cross-validation folds - spatial_folds = blockCV::cv_spatial( - pa_spec, - column = "present", - k = 5, - progress = F, plot = F, report = F - ) - - pa_spec$fold = spatial_folds$folds_ids - pa_spec$geometry = NULL - # Split into train and test datasets train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) pa_spec$train = 0 pa_spec$train[train_index] = 1 - return(pa_spec) -}) - -ssdm_data = bind_rows(ssdm_data) -save(ssdm_data, file = "data/r_objects/ssdm_data.RData") - -# ---------------------------------------------------------------------------# -# Create MSDM dataset #### -# ---------------------------------------------------------------------------# -load("data/r_objects/occs_final.RData") -load("data/r_objects/sa_polygon.RData") -sf::sf_use_s2(use_s2 = FALSE) - -occs_split = split(occs_final, occs_final$species) - -future::plan("multisession", workers = 8) -msdm_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){ - # Define model/sampling region - occs_bbox = occs_spec %>% - sf::st_bbox() %>% - expand_bbox(min_span = 1, expansion = 0.25) %>% - sf::st_as_sfc() %>% - st_set_crs(st_crs(occs_spec)) - - sample_region = suppressMessages( - st_as_sf(st_intersection(occs_bbox, sa_polygon)) - ) - - # Sample pseudo absence - sample_points = st_as_sf(st_sample(sample_region, nrow(occs_spec))) %>% - dplyr::mutate( - species = occs_spec$species[1], - longitude = sf::st_coordinates(.)[,1], - latitude = sf::st_coordinates(.)[,2] - ) %>% - dplyr::rename(geometry = "x") - - abs_spec = terra::rast(raster_filepaths) %>% - setNames(paste0("layer_", 1:19)) %>% - terra::extract(sample_points) %>% - dplyr::select(-ID) %>% - dplyr::mutate( - present = 0, - geometry = sample_points$x - ) %>% - tibble() %>% - bind_cols(sample_points) + # Define cross-validation folds + folds = tryCatch({ + spatial_folds = suppressMessages( + blockCV::cv_spatial( + pa_spec, + column = "present", + k = 5, + progress = F, plot = F, report = F + ) + ) + + spatial_folds$folds_ids + }, warning = function(w){ + NA + }, error = 
function(e){ + NA + }) - # Create presence-absence dataframe - pa_spec = occs_spec %>% - dplyr::mutate(present = 1) %>% - bind_rows(abs_spec) + pa_spec$folds = folds + pa_spec$geometry = NULL return(pa_spec) }) -msdm_data = bind_rows(msdm_data) -save(ssdm_data, file = "data/r_objects/msdm_data.RData") +model_data = bind_rows(model_data) +save(model_data, file = "data/r_objects/model_data.RData") diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R index 22e35e6d3ecff5b2a26397c68074d29f583ea825..bdc7c9f0df3f6acd185a28f872cf5ce97d106cb9 100644 --- a/R/04_01_ssdm_modeling.R +++ b/R/04_01_ssdm_modeling.R @@ -7,23 +7,23 @@ library(pROC) source("R/utils.R") -load("data/r_objects/ssdm_data.RData") +load("data/r_objects/model_data.RData") -pa_split = split(ssdm_data, ssdm_data$species) +data_split = split(model_data, model_data$species) # ----------------------------------------------------------------------# # Train models #### # ----------------------------------------------------------------------# future::plan("multisession", workers = 8) -ssdm_results = furrr::future_map(pa_split, .options = furrr::furrr_options(seed = 123), .f = function(pa_spec){ +ssdm_results = furrr::future_map(data_split, .options = furrr::furrr_options(seed = 123), .f = function(data_spec){ # Initial check - if(nrow(pa_spec) < 10){ + if(nrow(data_spec) < 10 | anyNA(data_spec$folds)){ return(NULL) } - pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P")) - train_data = dplyr::filter(pa_spec, train == 1) - test_data = dplyr::filter(pa_spec, train == 0) + data_spec$present_fct = factor(data_spec$present, levels = c("0", "1"), labels = c("A", "P")) + train_data = dplyr::filter(data_spec, train == 1) + test_data = dplyr::filter(data_spec, train == 0) # Define empty result for performance eval na_performance = list( @@ -137,8 +137,8 @@ ssdm_results = furrr::future_map(pa_split, .options = furrr::furrr_options(seed # Summarize results performance_summary = tibble( - species = pa_spec$species[1], - obs = nrow(pa_spec), + species = data_spec$species[1], + obs = nrow(data_spec), model = c("RF", "GBM", "GLM", "NN"), auc = c(rf_performance$AUC, gbm_performance$AUC, glm_performance$AUC, nn_performance$AUC), accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, glm_performance$Accuracy, nn_performance$Accuracy), diff --git a/R/04_02_msdm_modeling.R b/R/04_02_msdm_modeling.R new file mode 100644 index 0000000000000000000000000000000000000000..84f9ba5c2cd4faab3ce35ebc45b2331397ac0fd4 --- /dev/null +++ b/R/04_02_msdm_modeling.R @@ -0,0 +1,80 @@ +library(dplyr) +library(tidyr) +library(cito) + +source("R/utils.R") + +load("data/r_objects/model_data.RData") + +model_data$species_int = as.integer(as.factor(model_data$species)) + +train_data = dplyr::filter(model_data, train == 1) +test_data = dplyr::filter(model_data, train == 0) + +# ----------------------------------------------------------------------# +# Train model #### +# ----------------------------------------------------------------------# +predictors = paste0("layer_", 1:19) +formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 20, lambda = 0.000001)")) + +msdm_fit = dnn( + formula, + data = train_data, + hidden = c(500L, 1000L, 1000L), + loss = "binomial", + activation = c("sigmoid", "leaky_relu", "leaky_relu"), + epochs = 10000L, + lr = 0.01, + baseloss = 1, + batchsize = nrow(train_data)/5, + dropout = 0.25, + burnin = 100, + optimizer = config_optimizer("adam", 
weight_decay = 0.001), + lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), + early_stopping = 250, + validation = 0.3, + device = "cuda" +) + +save(msdm_fit, file = "data/r_objects/msdm_fit.RData") + +# ----------------------------------------------------------------------# +# Evaluate model #### +# ----------------------------------------------------------------------# +load("data/r_objects/msdm_fit.RData") + +# Overall +preds_train = predict(msdm_fit, newdata = as.matrix(train_data), type = "response") +preds_test = predict(msdm_fit, newdata = as.matrix(test_data), type = "response") + +hist(preds_train) +hist(preds_test) + +eval_overall = evaluate_model(msdm_fit, test_data) + +# Per species +data_split = split(model_data, model_data$species) + +msdm_results = lapply(data_split, function(data_spec){ + test_data = dplyr::filter(data_spec, train == 0) + + msdm_performance = tryCatch({ + evaluate_model(msdm_fit, test_data) + }, error = function(e){ + list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA) + }) + + performance_summary = tibble( + species = data_spec$species[1], + obs = nrow(data_spec), + model = "NN_MSDM", + auc = msdm_performance$AUC, + accuracy = msdm_performance$Accuracy, + kappa = msdm_performance$Kappa, + precision = msdm_performance$Precision, + recall = msdm_performance$Recall, + f1 = msdm_performance$F1 + ) +}) %>% bind_rows() + +save(msdm_results, file = "data/r_objects/msdm_results.RData") diff --git a/R/04_distribution_modeling.R b/R/04_distribution_modeling.R deleted file mode 100644 index 9baa270cef154373dd3abab92af9360363afa635..0000000000000000000000000000000000000000 --- a/R/04_distribution_modeling.R +++ /dev/null @@ -1,217 +0,0 @@ -# ---------------------------------------------------------------------------# -# Main loop #### -# ---------------------------------------------------------------------------# -load("data/r_objects/occs_final.RData") -load("data/r_objects/sa_polygon.RData") - -occs_split = split(occs_final, occs_final$species) - -future::plan("multisession", workers = 20) - -ssdm_data = furrr::future_map(occs_split[1:20], .options = furrr::furrr_options(seed = 123), .f = function(occs_spec){ - # Initial check - if(nrow(occs_spec) < 10){ - return(NULL) - } - - species = occs_spec$species[1] - - # ------------------------------- # - # Define model/sampling region #### - # ------------------------------- # - sample_region = tryCatch({ - sf::sf_use_s2(use_s2 = FALSE) - - occs_bbox = occs_spec %>% - sf::st_bbox() %>% - expand_bbox(0.25) %>% - sf::st_as_sfc() %>% - st_set_crs(st_crs(occs_spec)) - - sample_region = suppressMessages( - st_as_sf(st_intersection(occs_bbox, sa_polygon)) - ) - }, error = function(e){ - glue::glue("Skipping species {species}:\n{e}") - }) - - if(is.character(sample_region)){ # There was an error - return(sample_region) - } - - # ------------------------------- # - # Create pseudo absence #### - # ------------------------------- # - sample_points = st_as_sf(st_sample(sample_region, nrow(occs_spec))) # TODO: Vary sample size? 
- - abs_spec = terra::rast(raster_filepaths) %>% - setNames(paste0("layer_", 1:19)) %>% - terra::extract(sample_points) %>% - dplyr::select(-ID) %>% - dplyr::mutate( - presence = 0, - geometry = sample_points$x - ) %>% - tibble() - - # ------------------------------- # - # Create modeling dataset #### - # ------------------------------- # - model_data = occs_spec %>% - dplyr::mutate(presence = 1) %>% - bind_rows(abs_spec) %>% - dplyr::mutate( - presence_fct = as.factor(ifelse(presence == 0, "absent", "present")) - ) - - # Define cross-validation folds - spatial_folds = blockCV::cv_spatial( - model_data, - column = "presence", - k = 5, - progress = F, plot = F, report = F - ) - - model_data$fold = spatial_folds$folds_ids - model_data$geometry = NULL - - # Split into train and test datasets - train_index = createDataPartition(model_data$presence, p = 0.7, list = FALSE) - train_data = model_data[train_index, ] - test_data = model_data[-train_index, ] - - # Define predictor columns - predictors = paste0("layer_", 1:19) - - # Define empty performance summary - na_performance = list( - AUC = NA, - Accuracy = NA, - Kappa = NA, - Precision = NA, - Recall = NA, - F1 = NA - ) - - # ------------------------------- # - # Train models #### - # ------------------------------- # - ## cito #### - config_lr_scheduler() - - nn_performance = tryCatch({ - nn_fit = dnn( - X = train_data[, predictors], - Y = train_data$presence, - hidden = c(500L, 200L, 50L), - loss = "binomial", - activation = c("sigmoid", "leaky_relu", "leaky_relu"), - epochs = 500L, - lr = 0.02, - baseloss=10, - batchsize=nrow(train_data)/4, - dropout = 0.1, # Regularization - optimizer = config_optimizer("adam", weight_decay = 0.001), - lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7), - early_stopping = 200, # stop training when validation loss does not decrease anymore - validation = 0.3, # used for early stopping and lr_scheduler - device = "cuda", - bootstrap = 5 - ) - - evaluate_model(nn_fit, test_data) - }, error = function(e){ - na_performance - }) - - ## caret #### - # Define training routine - index_train = lapply(unique(sort(train_data$fold)), function(x){ - return(which(train_data$fold != x)) - }) - - train_ctrl = trainControl( - search = "grid", - classProbs = TRUE, - index = index_train, - summaryFunction = twoClassSummary, - savePredictions = "final" - ) - - # Random Forest - rf_performance = tryCatch({ - rf_grid = expand.grid( - mtry = c(3,7,11,15,19) # Number of randomly selected predictors - ) - - rf_fit = caret::train( - x = train_data[, predictors], - y = train_data$presence_fct, - method = "rf", - metric = "ROC", - tuneGrid = rf_grid, - trControl = train_ctrl - ) - evaluate_model(rf_fit, test_data) - }, error = function(e){ - na_performance - }) - - # Gradient Boosted Machine - gbm_performance = tryCatch({ - gbm_grid <- expand.grid( - n.trees = c(100, 500, 1000, 1500), # Higher number of boosting iterations - interaction.depth = c(3, 5, 7), # Maximum depth of each tree - shrinkage = c(0.01, 0.005, 0.001), # Lower learning rates - n.minobsinnode = c(10, 20) # Minimum number of observations in nodes - ) - - gbm_fit = train( - x = train_data[, predictors], - y = train_data$presence_fct, - method = "gbm", - metric = "ROC", - verbose = F, - tuneGrid = gbm_grid, - trControl = train_ctrl - ) - evaluate_model(gbm_fit, test_data) - }, error = function(e){ - na_performance - }) - - # Generalized additive Model - glm_performance = tryCatch({ - glm_fit = train( - x = train_data[, predictors], - y 
= train_data$presence_fct, - method = "glm", - family=binomial, - metric = "ROC", - preProcess = c("center", "scale"), - trControl = train_ctrl - ) - evaluate_model(glm_fit, test_data) - }, error = function(e){ - na_performance - }) - - # Summarize results - performance_summary = tibble( - species = species, - obs = nrow(occs_spec), - model = c("NN", "RF", "GBM", "GLM"), - auc = c(nn_performance$AUC, rf_performance$AUC, gbm_performance$AUC, glm_performance$AUC), - accuracy = c(nn_performance$Accuracy, rf_performance$Accuracy, gbm_performance$Accuracy, glm_performance$Accuracy), - kappa = c(nn_performance$Kappa, rf_performance$Kappa, gbm_performance$Kappa, glm_performance$Kappa), - precision = c(nn_performance$Precision, rf_performance$Precision, gbm_performance$Precision, glm_performance$Precision), - recall = c(nn_performance$Recall, rf_performance$Recall, gbm_performance$Recall, glm_performance$Recall), - f1 = c(nn_performance$F1, rf_performance$F1, gbm_performance$F1, glm_performance$F1) - ) - - return(performance_summary) -}) - -model_data_df = bind_rows(model_data) - -save(model_results, file = "data/r_objects/model_results.RData") \ No newline at end of file diff --git a/R/05_performance_analysis.qmd b/R/05_performance_analysis.qmd index a481b1d7032b42d65ce7cebb1a9b5f03aa5c83b4..c2d7d442fe8689080c38bbbdc8acf5e926ab8c2a 100644 --- a/R/05_performance_analysis.qmd +++ b/R/05_performance_analysis.qmd @@ -13,6 +13,7 @@ library(plotly) library(DT) load("../data/r_objects/ssdm_results.RData") +load("../data/r_objects/msdm_results.RData") load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData") load("../data/r_objects/occs_final.RData") @@ -84,7 +85,7 @@ Generalized Linear Model - Logistic model with binomial link function -Neural Netwok +Neural Network (single-species) - Three hidden layers (sigmoid - 500, leaky_relu - 200, leaky_relu - 50) @@ -92,6 +93,8 @@ Neural Netwok - Adam optimizer +Neural Network (multi-species) + ### Key findings: - Performance: RF \> GBM \> GLM \> NN @@ -105,7 +108,7 @@ Neural Netwok The table below shows the analysed modeling results. ```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} -performance = ssdm_results %>% +performance = bind_rows(ssdm_results, msdm_results) %>% pivot_longer(c(auc, accuracy, kappa, precision, recall, f1), names_to = "metric") %>% dplyr::filter(!is.na(value)) %>% dplyr::mutate( @@ -504,7 +507,7 @@ bslib::card(plot, full_screen = T) Functional groups were assigned based on taxonomic order. 
The following groupings were used: | Functional group | Taxomic orders | -|-----------------------|-------------------------------------------------| +|------------------------|------------------------------------------------| | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla | | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha | | arboreal | Primates, Pilosa | diff --git a/R/05_performance_analysis.rmarkdown b/R/05_performance_analysis.rmarkdown new file mode 100644 index 0000000000000000000000000000000000000000..d82a62297406fbfbed633ecbdc7ecba1cbc828bf --- /dev/null +++ b/R/05_performance_analysis.rmarkdown @@ -0,0 +1,575 @@ +--- +title: "sSDM Performance analysis" +format: html +editor: visual +engine: knitr +--- + +```{r init, echo = FALSE, include = FALSE} +library(tidyverse) +library(Symobio) +library(sf) +library(plotly) +library(DT) + +load("../data/r_objects/ssdm_results.RData") +load("../data/r_objects/msdm_results.RData") +load("../data/r_objects/range_maps.RData") +load("../data/r_objects/range_maps_gridded.RData") +load("../data/r_objects/occs_final.RData") +load("../data/r_objects/functional_groups.RData") +sf::sf_use_s2(use_s2 = FALSE) +``` + +```{r globals, echo = FALSE, include = FALSE} +# Select metrics +focal_metrics = c("auc", "f1", "accuracy") # There's a weird bug in plotly that scrambles up lines when using more than three groups + +# Dropdown options +plotly_buttons = list() +for(metric in focal_metrics){ + plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", metric), label = metric) +} + +# Regression functions +asym_regression = function(x, y){ + nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) + new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) + data.frame( + x = new_x, + fit = predict(nls_fit, newdata = data.frame(x = new_x)) + ) +} + +lin_regression = function(x, y){ + glm_fit = suppressWarnings(glm(y~x, family = "binomial")) + new_x = seq(min(x), max(x), length.out = 100) + data.frame( + x = new_x, + fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") + ) +} + +``` + + +## Summary + +This document summarizes the performance of four SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `r I(length(unique(ssdm_results$species)))` South American mammal species. We use `r I(length(focal_metrics))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group). + +### Modeling decisions: + +#### Data + +- Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25) +- Balanced presences and absences for each species +- Predictors: all 19 CHELSA bioclim variables +- 70/30 Split of training vs. 
test data +#### Algorithms + +Random Forest + +- Spatial block cross-validation during training + +- Hyperparameter tuning of `mtry` + +Generalized Boosted Machine + +- Spatial block cross-validation during training + +- Hyperparameter tuning across `n.trees`, `interaction.depth`, `shrinkage`, `n.minobsinnode` + +Generalized Linear Model + +- Spatial block cross-validation during training + +- Logistic model with binomial link function + +Neural Network (single-species) + +- Three hidden layers (sigmoid - 500, leaky_relu - 200, leaky_relu - 50) + +- Binomial loss + +- Adam optimizer + +Neural Network (multi-species) + +### Key findings: + +- Performance: RF \> GBM \> GLM \> NN +- Convergence problems with Neural Network Models +- More occurrence records and larger range sizes tended to improve model performance +- Higher range coverage correlated with better performance. +- Range coverage bias and functional group showed some impact but were less consistent + +## Analysis + +The table below shows the analysed modeling results. + + +```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} +performance = bind_rows(ssdm_results, msdm_results) %>% + pivot_longer(c(auc, accuracy, kappa, precision, recall, f1), names_to = "metric") %>% + dplyr::filter(!is.na(value)) %>% + dplyr::mutate( + metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")), + value = round(pmax(value, 0, na.rm = T), 3) # Fix one weird instance of f1 < 0 + ) %>% + dplyr::filter(metric %in% focal_metrics) + +DT::datatable(performance) +``` + + +### Number of records + +- Model performance was generally better for species with more observations +- Very poor performance below 50-100 observations + + +```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} +df_plot = performance + +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~asym_regression(.x$obs, .x$value)) +}) + +# Create base plot +plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Number of observations", + xaxis = list(title = "Number of observations", type = "log"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + +# Points +for (model_name in unique(df_plot$model)) { + plot = plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~obs, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +# Add regression lines +for(model_name in unique(df_plot$model)){ + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +bslib::card(plot, full_screen = T) +``` + + +### Range characteristics + +#### Range size + +Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). + +- Model performance tended to be slightly higher for species with larger range size +- Only RF shows continuous performance improvements beyond range sizes of \~5M km² + + +```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} + +df_join = range_maps %>% + dplyr::mutate(range_size = as.numeric(st_area(range_maps) / 1000000)) %>% # range in sqkm + sf::st_drop_geometry() + +df_plot = performance %>% + inner_join(df_join, by = c("species" = "name_matched")) + +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~asym_regression(.x$range_size, .x$value)) +}) + +# Create base plot +plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Range size", + xaxis = list(title = "Range size"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_size, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range size:", range_size, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +bslib::card(plot, full_screen = T) +``` + + +#### Range coverage + +Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. + +$$ +RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} +$$ + +- Models for species with higher range coverage showed slightly better performance + + +```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE} +df_cells_total = range_maps_gridded %>% + dplyr::rename("species" = name_matched) %>% + group_by(species) %>% + summarise(cells_total = n()) %>% + st_drop_geometry() + +df_cells_occ <- range_maps_gridded %>% + st_join(occs_final, join = st_intersects) %>% + filter(name_matched == species) %>% # Filter only intersections of the same species + group_by(species) %>% + summarise(cells_occupied = n_distinct(geometry)) %>% + st_drop_geometry() + +df_join = df_cells_total %>% + dplyr::inner_join(df_cells_occ, by = "species") %>% + dplyr::mutate(range_cov = cells_occupied / cells_total) %>% + dplyr::select(species, range_cov) + +df_plot = performance %>% + inner_join(df_join, by = "species") + +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_cov, .x$value)) +}) + +# Create base plot +plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Range coverage", + xaxis = list(title = "Range coverage"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_cov, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage:", range_cov, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +bslib::card(plot, full_screen = T) +``` + + +#### Range coverage bias + +Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. + +$$ +RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)} +$$ + +Higher bias values indicate that occurrence records are spatially more clustered within the range of the species. + +- There was no strong relationship between range coverage bias and model performance + + +```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE} +df_occs_total = occs_final %>% + st_drop_geometry() %>% + group_by(species) %>% + summarise(occs_total = n()) + +df_join = df_occs_total %>% + dplyr::inner_join(df_cells_total, by = "species") %>% + dplyr::inner_join(df_cells_occ, by = "species") %>% + dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1))) + +df_plot = performance %>% + inner_join(df_join, by = "species") + +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_bias, .x$value)) +}) + +# Create base plot +plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Range coverage bias", + xaxis = list(title = "Range coverage bias"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_bias, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + + +bslib::card(plot, full_screen = T) +``` + + +### Functional group + +Functional groups were assigned based on taxonomic order. The following groupings were used: + +| Functional group | Taxomic orders | +|------------------------|------------------------------------------------| +| large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla | +| small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha | +| arboreal | Primates, Pilosa | +| flying | Chiroptera | + +- Models for bats tended to perform slightly worse than for other groups. + + +```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE} +df_plot = performance %>% + dplyr::left_join(functional_groups, by = c("species" = "name_matched")) + +plot <- plot_ly( + data = df_plot, + x = ~functional_group, + y = ~value, + color = ~model, + type = 'box', + boxpoints = "all", + jitter = 1, + pointpos = 0, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] # default value + ) + ) +) + +plot <- plot %>% + layout( + title = "Model Performance vs. 
Functional Group", + xaxis = list(title = "Functional group"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + boxmode = "group", + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) + ) + ) + +bslib::card(plot, full_screen = T) +``` + diff --git a/R/utils.R b/R/utils.R index ce483b7fe58487db918f6170046344f72e9ef304..5539cfd5a9eed4da5e7aea07de93fac44ac676e2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,14 +40,21 @@ evaluate_model <- function(model, test_data) { # Predict probabilities if(class(model) %in% c("citodnn", "citodnnBootstrap")){ - probs <- predict(model, test_data, type = "response")[,1] + # This is a hack to fix naming issue during initial training run. Fix for later runs: + # ---- + test_data = test_data %>% + dplyr::select(-species) %>% + dplyr::rename(species = species_int) + # ---- + + probs <- predict(model, as.matrix(test_data), type = "response")[,1] preds <- factor(round(probs), levels = c("0", "1"), labels = c("A", "P")) } else { probs <- predict(model, test_data, type = "prob")$P preds <- predict(model, test_data, type = "raw") } - actual <- test_data$present_fct + actual <- factor(test_data$present, levels = c("0", "1"), labels = c("A", "P")) # Calculate AUC auc <- pROC::roc(actual, probs, levels = c("P", "A"), direction = ">")$auc @@ -56,12 +63,14 @@ evaluate_model <- function(model, test_data) { cm <- caret::confusionMatrix(preds, actual, positive = "P") # Return metrics - return(list( - AUC = auc, - Accuracy = cm$overall["Accuracy"], - Kappa = cm$overall["Kappa"], - Precision = cm$byClass["Precision"], - Recall = cm$byClass["Recall"], - F1 = cm$byClass["F1"] - )) + return( + list( + AUC = as.numeric(auc), + Accuracy = cm$overall["Accuracy"], + Kappa = cm$overall["Kappa"], + Precision = cm$byClass["Precision"], + Recall = cm$byClass["Recall"], + F1 = cm$byClass["F1"] + ) + ) }