--- title: "sSDM Performance analysis" format: html editor: visual --- ```{r init, echo = FALSE, include = FALSE} library(tidyverse) library(Symobio) library(sf) library(plotly) library(DT) load("../r_objects/model_results.RData") load("../r_objects/test_species.RData") load("../r_objects/range_maps.RData") load("../r_objects/range_maps_gridded.RData") load("../r_objects/occs_final.RData") sf::sf_use_s2(use_s2 = FALSE) ``` ## Summary This document summarizes the performance of three SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model) for 96 South American terrestrial mammal species. We use six metrics (AUC, F1, kappa, accuracy, precision, and recall) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range occupancy, spatial dispersion, and functional group). Modeling decisions: - Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25) - Balanced presences and absences for each species - Predictors: all 19 CHELSA bioclim variables - Spatial block cross-validation - 70/30 Split of training vs. test data - Grid search hyperparameter tuning for RF and GBM Key findings: - RF and GBM models generally performed better than GLM across metrics - More occurrence records and larger range sizes tended to improve model accuracy - Higher range occupancy correlated with better performance. - Spatial dispersion and functional group showed some impact but were less consistent ## Analysis The table below shows the analysed modeling results. ```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} performance = model_results %>% purrr::keep(inherits, 'data.frame') %>% bind_rows() %>% pivot_longer(c(auc, accuracy, kappa, precision, recall, f1), names_to = "metric") %>% dplyr::filter(!is.na(value)) %>% dplyr::mutate( metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")), value = pmax(value, 0, na.rm = T) # Fix one weird instance of f1 < 0 ) DT::datatable(performance) ``` ### Number of records - Model performance was generally better for species with more observations - No major improvements in model performance beyond \~500 observations ```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} df_plot = performance # Create base plot plot <- plot_ly( data = df_plot, x = ~obs, y = ~value, color = ~model, type = 'scatter', mode = 'markers', name = ~model, hoverinfo = 'text', text = ~paste("Species:", species, "<br>Obervations:", obs, "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', value = 'auc' # default value ) ) ) # Add dropdown for selecting metric plot <- plot %>% layout( title = "Model Performance vs. Number of observations", xaxis = list(title = "Number of observations"), yaxis = list(title = "Value"), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest', updatemenus = list( list( type = "dropdown", active = 0, buttons = list( list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") ) ) ) ) bslib::card(plot, full_screen = T) ``` ### Range characteristics The spatial coverage of species occurrences was evaluated using a systematic hexagonal grid system, with each cell spanning 1 degree in both latitudinal and longitudinal dimensions. Two complementary metrics were used to assess range coverage: First, sampling completeness was quantified by determining the number of grid cells containing at least one occurrence record (*occupancy*). Second, we calculated the theoretical maximum number of occupied hexagons possible given the total number of observations (*spatial dispersion*). #### Range size - Model performance was lower for small ranged species - No major improvements in model performance beyond range size of \~3M km² ```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} df_join = range_maps %>% dplyr::mutate(range_size = as.numeric(st_area(range_maps) / 1000000)) %>% # range in sqkm sf::st_drop_geometry() df_plot = performance %>% inner_join(df_join, by = c("species" = "name_matched")) # Create base plot plot <- plot_ly( data = df_plot, x = ~range_size, y = ~value, color = ~model, type = 'scatter', mode = 'markers', name = ~model, hoverinfo = 'text', text = ~paste("Species:", species, "<br>Range size:", round(range_size, -3), "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', value = 'auc' # default value ) ) ) # Add dropdown for selecting metric plot <- plot %>% layout( title = "Model Performance vs. Range size", xaxis = list(title = "Range size [sqkm]"), yaxis = list(title = "Value"), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest', updatemenus = list( list( type = "dropdown", active = 0, buttons = list( list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") ) ) ) ) bslib::card(plot, full_screen = T) ``` #### Occupancy - Models for species with higher range occupancy showed slightly better performance ```{r range_occupancy, echo = FALSE, message=FALSE, warnings=FALSE} df_cells_total = range_maps_gridded %>% dplyr::rename("species" = name_matched) %>% group_by(species) %>% summarise(cells_total = n()) %>% st_drop_geometry() df_cells_occ <- range_maps_gridded %>% st_join(occs_final, join = st_intersects) %>% filter(name_matched == species) %>% # Filter only intersections of the same species group_by(species) %>% summarise(cells_occupied = n_distinct(geometry)) %>% st_drop_geometry() df_join = df_cells_total %>% dplyr::inner_join(df_cells_occ, by = "species") %>% dplyr::mutate(occupancy = cells_occupied / cells_total) %>% dplyr::select(species, occupancy) df_plot = performance %>% inner_join(df_join, by = "species") # Create base plot plot <- plot_ly( data = df_plot, x = ~occupancy, y = ~value, color = ~model, type = 'scatter', mode = 'markers', name = ~model, hoverinfo = 'text', text = ~paste("Species:", species, "<br>Range occupancy:", round(occupancy, 3), "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', value = 'auc' # default value ) ) ) # Add dropdown for selecting metric plot <- plot %>% layout( title = "Model Performance vs. Range occupancy", xaxis = list(title = "Range occupancy"), yaxis = list(title = "Value"), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest', updatemenus = list( list( type = "dropdown", active = 0, buttons = list( list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") ) ) ) ) bslib::card(plot, full_screen = T) ``` #### Spatial dispersion - Models for species with higher spatial dispersion tended to exhibit lower performance - Large spread, weak relationship ```{r spatial_dispersion, echo = FALSE, message=FALSE, warnings=FALSE} df_occs_total = occs_final %>% st_drop_geometry() %>% group_by(species) %>% summarise(occs_total = n()) df_join = df_occs_total %>% dplyr::inner_join(df_cells_total, by = "species") %>% dplyr::inner_join(df_cells_occ, by = "species") %>% dplyr::mutate(dispersion = cells_occupied / pmin(cells_total, occs_total)) df_plot = performance %>% inner_join(df_join, by = "species") # Create base plot plot <- plot_ly( data = df_plot, x = ~dispersion, y = ~value, color = ~model, type = 'scatter', mode = 'markers', name = ~model, hoverinfo = 'text', text = ~paste("Species:", species, "<br>Dispersion:", round(dispersion, 3), "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', value = 'auc' # default value ) ) ) # Add dropdown for selecting metric plot <- plot %>% layout( title = "Model Performance vs. Dispersion", xaxis = list(title = "Dispersion"), yaxis = list(title = "Value"), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest', updatemenus = list( list( type = "dropdown", active = 0, buttons = list( list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") ) ) ) ) bslib::card(plot, full_screen = T) ``` ### Functional group - No major difference in model performance among functional groups ```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE} df_join = test_species %>% dplyr::distinct(name_matched, functional_group) %>% dplyr::mutate(functional_group = factor(functional_group, labels = c("arboreal", "large ground-dwelling", "small ground-dwelling"))) df_plot = performance %>% dplyr::left_join(df_join, by = c("species" = "name_matched")) plot <- plot_ly( data = df_plot, x = ~functional_group, y = ~value, color = ~model, type = 'box', boxpoints = "all", jitter = 1, pointpos = 0, hoverinfo = 'text', text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', value = 'auc' # default value ) ) ) plot <- plot %>% layout( title = "Model Performance vs. Functional Group", xaxis = list(title = "Range occupancy"), yaxis = list(title = "Value"), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest', boxmode = "group", updatemenus = list( list( type = "dropdown", active = 0, buttons = list( list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") ) ) ) ) bslib::card(plot, full_screen = T) ```