--- title: "SDM Performance analysis" format: html editor: visual engine: knitr --- ```{r init, echo = FALSE, include = FALSE} library(tidyverse) library(sf) library(plotly) library(DT) load("../data/r_objects/model_data.RData") load("../data/r_objects/ssdm_results.RData") load("../data/r_objects/msdm_embed_performance.RData") load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData") load("../data/r_objects/occs_final.RData") load("../data/r_objects/functional_groups.RData") sf::sf_use_s2(use_s2 = FALSE) ``` ```{r globals, echo = FALSE, include = FALSE} # Count occs per species obs_count = model_data %>% sf::st_drop_geometry() %>% dplyr::filter(present == 1) %>% dplyr::group_by(species) %>% dplyr::summarise(obs = n()) # Regression functions asym_regression = function(x, y){ nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) data.frame( x = new_x, fit = predict(nls_fit, newdata = data.frame(x = new_x)) ) } lin_regression = function(x, y, family = "binomial"){ glm_fit = suppressWarnings(glm(y~x, family = family)) new_x = seq(min(x), max(x), length.out = 100) data.frame( x = new_x, fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") ) } # Performance table performance = ssdm_results %>% bind_rows(msdm_embed_performance) %>% dplyr::group_by(species, model, metric) %>% dplyr::summarise(value = mean(value, na.rm = F)) %>% dplyr::mutate( metric = stringr::str_to_lower(metric), value = case_when( ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, .default = value ) ) focal_metrics = unique(performance$metric) plot_performance = function(df_plot, metric, x_var, x_label, x_log = T, reg_func = lin_regression) { df_plot = dplyr::filter(df_plot, metric == !!metric) %>% dplyr::rename(x_var = !!x_var) # Calculate regression lines for each model and metric combination suppressWarnings({ regression_lines = df_plot %>% group_by(model) %>% group_modify( ~ reg_func(.x$x_var, .x$value)) }) # Create base plot plot <- plot_ly() %>% layout( title = paste0("Model Performance vs. ", x_label), xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")), yaxis = list(title = metric), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest' ) # Points for (model_name in unique(df_plot$model)) { plot = plot %>% add_markers( data = filter(df_plot, model == model_name, metric %in% focal_metrics), x = ~ x_var, y = ~ value, color = model_name, # Set color to match legendgroup legendgroup = model_name, opacity = 0.6, name = ~ model, hoverinfo = 'text', text = ~ paste( "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(value, 3) ) ) } # Add regression lines for (model_name in unique(df_plot$model)) { reg_data = dplyr::filter(regression_lines, model == model_name) plot = plot %>% add_lines( data = reg_data, x = ~ x, y = ~ fit, color = model_name, # Set color to match legendgroup legendgroup = model_name, name = paste(model_name, '(fit)'), showlegend = FALSE ) } return(plot) } ``` ## Summary This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest. Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). ### Modeling overview: #### General decisions - Balanced presences and absences for each species - Absence sampling from probability surface that balances geographical sampling bias and environmental preferences per species - higher probability in areas that have been sampled more intensely - lower probability in areas with environmental conditions similar to presence locations {width="100%" height="800"} - Eight predictors: - Min Temperature of Coldest Month (bio6) - Precipitation of Driest Quarter (bio17) - Climate Moisture Index (cmi) - Surface Downwelling Shortwave Flux (rsds) - Tree canopy cover (igfc) - Distance to freshwater (dtfw) - Seasonal inundation(igsw) - Topographic roughness (roughness) - Nested cross validation - Outer (performance evaluation): Spatially blocked 5-fold cross validation for all models - Inner (model training): Random split (1/5) in cito, Spatially blocked 5-fold cross validation in other models #### sSDM Algorithms - Four Algorithms: Random Forest (RF), Gradient Boosting Machine (GBM), Generalized Additive Model (GAM), Neural Network (NN) - NN: Manual hyperparameter tuning, same settings across species - RF + GBM + GAM: Automated hyperparameter tuning (4 random combinations) per species #### mSDM Algorithms Multispecies Neural Network with species embedding (**MSDM_embed**) - prediction: probability of occurrence given a set of (environmental) inputs and species identity - embedding initialized at random - three hidden layers, sigmoid + leaky ReLu activations, binomial loss ### Key findings: TBA ## Analysis The table below shows the analysed modeling results. ```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} DT::datatable(performance) %>% formatRound(columns="value", digits=3) ``` ### Number of records - Model performance was generally better for species with more observations - Very poor performance below 50-100 observations ```{r, echo = FALSE, message=FALSE, warnings=FALSE} df_plot = performance %>% dplyr::left_join(obs_count, by = "species") ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` ::: ### Range size Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). ```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} df_join = range_maps %>% dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>% # range in sqkm sf::st_drop_geometry() df_plot = performance %>% inner_join(df_join, by = c("species" = "name_matched")) ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "range_size", x_label = "Range size", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` ::: ### Range coverage Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. $$ RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}} $$ ```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE} occs_final_unproj = occs_final %>% sf::st_transform("EPSG:4326") df_cells_total = range_maps_gridded %>% dplyr::rename("species" = name_matched) %>% group_by(species) %>% summarise(cells_total = n()) %>% st_drop_geometry() df_cells_occ <- range_maps_gridded %>% st_join(occs_final_unproj, join = st_intersects) %>% filter(name_matched == species) %>% # Filter only intersections of the same species group_by(species) %>% summarise(cells_occupied = n_distinct(geometry)) %>% st_drop_geometry() df_join = df_cells_total %>% dplyr::inner_join(df_cells_occ, by = "species") %>% dplyr::mutate(range_cov = cells_occupied / cells_total) %>% dplyr::select(species, range_cov) df_plot = performance %>% inner_join(df_join, by = "species") ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` :::