--- title: "SDM Performance report" format: html editor: visual engine: knitr --- ```{r init, echo = FALSE, include = FALSE} library(tidyverse) library(sf) library(plotly) library(DT) load("../data/r_objects/model_data.RData") load("../data/r_objects/ssdm_results.RData") load("../data/r_objects/msdm_embed_performance.RData") load("../data/r_objects/msdm_onehot_performance.RData") load("../data/r_objects/msdm_rf_performance.RData") load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData") load("../data/r_objects/occs_final.RData") load("../data/r_objects/functional_groups.RData") sf::sf_use_s2(use_s2 = FALSE) ``` ```{r globals, echo = FALSE, include = FALSE} # Count occs per species obs_count = model_data %>% sf::st_drop_geometry() %>% dplyr::filter(present == 1) %>% dplyr::group_by(species) %>% dplyr::summarise(obs = n()) # Regression functions asym_regression = function(x, y){ nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) data.frame( x = new_x, fit = predict(nls_fit, newdata = data.frame(x = new_x)) ) } lin_regression = function(x, y, family = "binomial"){ glm_fit = suppressWarnings(glm(y~x, family = family)) new_x = seq(min(x), max(x), length.out = 100) data.frame( x = new_x, fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") ) } # Performance table performance = ssdm_results %>% bind_rows(msdm_embed_performance) %>% bind_rows(msdm_onehot_performance) %>% bind_rows(msdm_rf_performance) %>% dplyr::filter(fold_eval <= 3) %>% # TODO use all folds dplyr::group_by(species, model, metric) %>% dplyr::summarise(value = mean(value, na.rm = F)) %>% dplyr::mutate( metric = stringr::str_to_lower(metric), value = case_when( ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, .default = value ) ) focal_metrics = c("accuracy", "auc", "f1", "kappa", "precision", "recall") plot_performance = function(df_plot, metric, x_var, x_label, x_log = T, reg_func = lin_regression) { df_plot = dplyr::filter(df_plot, metric == !!metric) %>% dplyr::rename(x_var = !!x_var) # Calculate regression lines for each model and metric combination suppressWarnings({ regression_lines = df_plot %>% group_by(model) %>% group_modify( ~ reg_func(.x$x_var, .x$value)) }) # Create base plot plot <- plot_ly() %>% layout( title = paste0("Model Performance vs. 
", x_label), xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")), yaxis = list(title = metric), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest' ) # Points for (model_name in unique(df_plot$model)) { plot = plot %>% add_markers( data = filter(df_plot, model == model_name, metric %in% focal_metrics), x = ~ x_var, y = ~ value, color = model_name, # Set color to match legendgroup legendgroup = model_name, opacity = 0.6, name = ~ model, hoverinfo = 'text', text = ~ paste( "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(value, 3) ) ) } # Add regression lines for (model_name in unique(df_plot$model)) { reg_data = dplyr::filter(regression_lines, model == model_name) plot = plot %>% add_lines( data = reg_data, x = ~ x, y = ~ fit, color = model_name, # Set color to match legendgroup legendgroup = model_name, name = paste(model_name, '(fit)'), showlegend = FALSE ) } return(plot) } ``` ## Summary This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias). The comparison of sSDM vs mSDM approaches is of particular interest. Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). ### Modeling overview: #### General decisions - Balanced presences and absences for each species - Absence sampling from probability surface that balances geographical sampling bias and environmental preferences per species - higher probability in areas that have been sampled more intensely - lower probability in areas with environmental conditions similar to presence locations - Eight predictors: - Min Temperature of Coldest Month (bio6) - Precipitation of Driest Quarter (bio17) - Climate Moisture Index (cmi) - Surface Downwelling Shortwave Flux (rsds) - Tree canopy cover (igfc) - Distance to freshwater (dtfw) - Seasonal inundation(igsw) - Topographic roughness (roughness) - Nested cross validation - Outer (performance evaluation): Spatially blocked 5-fold cross validation for all models - Inner (model training): Random split (1/5) in cito, Spatially blocked 5-fold cross validation in other models #### sSDM Algorithms - Four algorithms: Random Forest (RF), Gradient Boosting Machine (GBM), Generalized Additive Model (GAM), Neural Network (NN) - NN: Manual hyperparameter tuning, same settings across species - RF + GBM + GAM: Automated hyperparameter tuning (8 random combinations) per species #### mSDM Algorithms - Three algorithms: Random Forest (MSDM_rf), Neural Network (MSDM_embed, MSDM_onehot) - Species identity part of the input data, internal representation then either as onehot vector (MSDM_rf, MSDM_onehot) or via embedding (MSDM_embed) ### Key findings: - MSDM algorithms score much higher across all performance algorithms - Among MSDM algorithms, RF outperforms NNs significantly ## Analysis ### Number of records ```{r, echo = FALSE, message=FALSE, warnings=FALSE} df_plot = performance %>% dplyr::left_join(obs_count, by = "species") ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "obs", x_label = "Number of records", reg_func = 
asym_regression) bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` ::: ### Range size Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). ```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} df_join = range_maps %>% dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>% # range in sqkm sf::st_drop_geometry() df_plot = performance %>% inner_join(df_join, by = c("species" = "name_matched")) ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "range_size", x_label = "Range size", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` ::: ### Range coverage Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. 
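The gridded ranges are loaded above as the precomputed object `range_maps_gridded`. For orientation, the following is a minimal sketch of how such a hexagonal grid could be derived from the range polygons with `sf`; the cell size argument and the join logic are assumptions for illustration, not the code that produced the shipped object, and the chunk is not evaluated.

```{r hex_grid_sketch, eval = FALSE}
# Illustrative sketch only (not the code that built range_maps_gridded):
# lay a hexagonal grid of roughly 1 degree cells over the range polygons and
# keep, per species, the cells that intersect its range.
hex_grid = sf::st_make_grid(range_maps, cellsize = 1, square = FALSE) %>%
  sf::st_as_sf()

range_maps_gridded_sketch = hex_grid %>%
  sf::st_join(range_maps, join = sf::st_intersects, left = FALSE) # keeps name_matched for each intersecting cell
```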
```{r range_coverage, echo = FALSE, message = FALSE, warning = FALSE}
occs_final_unproj = occs_final %>%
  sf::st_transform("EPSG:4326")

df_cells_total = range_maps_gridded %>%
  dplyr::rename("species" = name_matched) %>%
  group_by(species) %>%
  summarise(cells_total = n()) %>%
  st_drop_geometry()

df_cells_occ <- range_maps_gridded %>%
  st_join(occs_final_unproj, join = st_intersects) %>%
  filter(name_matched == species) %>% # Keep only intersections of the same species
  group_by(species) %>%
  summarise(cells_occupied = n_distinct(geometry)) %>%
  st_drop_geometry()

df_join = df_cells_total %>%
  dplyr::inner_join(df_cells_occ, by = "species") %>%
  dplyr::mutate(range_cov = cells_occupied / cells_total) %>%
  dplyr::select(species, range_cov)

df_plot = performance %>%
  inner_join(df_join, by = "species")
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Cohen's kappa

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "kappa", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```
:::

### Range coverage bias

Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.

$$
RangeCoverageBias = 1 - \frac{RangeCoverage}{\min(N_{obs\_total} / N_{cells\_total}, 1)}
$$

Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
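As a quick numerical illustration of the formula (made-up toy numbers, not taken from the data, and not evaluated in the report):

```{r bias_example, eval = FALSE}
# Toy example: 100 grid cells in the range, 20 of them occupied, 50 records in total
cells_total    = 100
cells_occupied = 20
occs_total     = 50

range_coverage = cells_occupied / cells_total   # 0.2
max_coverage   = min(occs_total / cells_total, 1) # 0.5 (50 records can cover at most 50 of 100 cells)
range_bias     = 1 - range_coverage / max_coverage # 0.6 -> records are fairly clustered
```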
```{r range_coverage_bias, echo = FALSE, message = FALSE, warning = FALSE}
occs_final_unproj = occs_final %>%
  sf::st_transform("EPSG:4326")

df_occs_total = occs_final_unproj %>%
  st_drop_geometry() %>%
  group_by(species) %>%
  summarise(occs_total = n())

df_join = df_occs_total %>%
  dplyr::inner_join(df_cells_total, by = "species") %>%
  dplyr::inner_join(df_cells_occ, by = "species") %>%
  dplyr::mutate(range_bias = 1 - ((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1)))

df_plot = performance %>%
  inner_join(df_join, by = "species")
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```
:::