From 7c726c394a9fd07d9942df28bcd9a7dfe808a536 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Thu, 7 Nov 2024 12:32:39 +0100
Subject: [PATCH] revamped ssdm performance analysis

---
 .gitignore                    |   4 +-
 R/04_01_ssdm_modeling.R       |   2 +-
 R/05_performance_analysis.qmd | 496 +++++++++++++++++++---------------
 R/_publish.yml                |   4 +
 4 files changed, 290 insertions(+), 216 deletions(-)
 create mode 100644 R/_publish.yml

diff --git a/.gitignore b/.gitignore
index 51c293e..4c95a7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,5 +11,5 @@ renv/cache/
 # Data files
 data/
-R/performance_analysis_files
-R/performance_analysis.html
\ No newline at end of file
+R/05_performance_analysis_files
+R/05_performance_analysis.html
\ No newline at end of file
diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R
index 5e8b92b..22e35e6 100644
--- a/R/04_01_ssdm_modeling.R
+++ b/R/04_01_ssdm_modeling.R
@@ -73,7 +73,7 @@ ssdm_results = furrr::future_map(pa_split, .options = furrr::furrr_options(seed
     # Gradient Boosted Machine ####
     gbm_performance = tryCatch({
       gbm_grid <- expand.grid(
-        n.trees = c(100, 500, 1000, 1500),       # Higher number of boosting iterations
+        n.trees = c(100, 500, 1000, 1500),       # number of trees
         interaction.depth = c(3, 5, 7),          # Maximum depth of each tree
         shrinkage = c(0.01, 0.005, 0.001),       # Lower learning rates
         n.minobsinnode = c(10, 20)               # Minimum number of observations in nodes
diff --git a/R/05_performance_analysis.qmd b/R/05_performance_analysis.qmd
index 8941c87..a481b1d 100644
--- a/R/05_performance_analysis.qmd
+++ b/R/05_performance_analysis.qmd
@@ -2,6 +2,7 @@
 title: "sSDM Performance analysis"
 format: html
 editor: visual
+engine: knitr
 ---
 
 ```{r init, echo = FALSE, include = FALSE}
@@ -19,22 +20,81 @@ load("../data/r_objects/functional_groups.RData")
 sf::sf_use_s2(use_s2 = FALSE)
 ```
 
+```{r globals, echo = FALSE, include = FALSE}
+# Select metrics
+focal_metrics = c("auc", "f1", "accuracy") # There's a weird bug in plotly that scrambles up lines when using more than three groups
+
+# Dropdown options
+plotly_buttons = list()
+for(metric in focal_metrics){
+  plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", metric), label = metric)
+}
+
+# Regression functions
+asym_regression = function(x, y){
+  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
+  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
+  data.frame(
+    x = new_x,
+    fit = predict(nls_fit, newdata = data.frame(x = new_x))
+  )
+}
+
+lin_regression = function(x, y){
+  glm_fit = suppressWarnings(glm(y~x, family = "binomial"))
+  new_x = seq(min(x), max(x), length.out = 100)
+  data.frame(
+    x = new_x,
+    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
+  )
+}
+
+```
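+
+For readers unfamiliar with plotly's filter transforms: every figure below attaches an identical `filter` transform on `metric` to each trace, and the dropdown buttons `restyle` `transforms[0].value` across all traces, so one click switches the displayed metric everywhere. The sketch below illustrates this pattern on made-up data; it is not part of the analysis, and the object names (`toy`, `toy_buttons`) are placeholders.
+
+```r
+library(plotly)
+
+set.seed(1)
+toy = data.frame(
+  x      = rep(1:20, times = 3),
+  value  = runif(60),
+  metric = rep(c("auc", "f1", "accuracy"), each = 20)
+)
+
+# One button per metric; each button overwrites the filter value of all traces
+toy_buttons = lapply(unique(toy$metric), function(m){
+  list(method = "restyle", args = list("transforms[0].value", m), label = m)
+})
+
+plot_ly(
+  data = toy, x = ~x, y = ~value,
+  type = "scatter", mode = "markers",
+  transforms = list(
+    list(type = "filter", target = ~metric, operation = "=", value = "auc") # default metric
+  )
+) %>%
+  layout(updatemenus = list(list(type = "dropdown", active = 0, buttons = toy_buttons)))
+```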
+
 ## Summary
 
-This document summarizes the performance of three SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `{r} length(unique(ssdm_results$species))` South American mammal species. We use six metrics (AUC, F1, kappa, accuracy, precision, and recall) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group).
+This document summarizes the performance of four SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `r I(length(unique(ssdm_results$species)))` South American mammal species. We use `r I(length(focal_metrics))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group).
+
+### Modeling decisions:
-Modeling decisions:
+#### Data
 
 - Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25)
 - Balanced presences and absences for each species
 - Predictors: all 19 CHELSA bioclim variables
-- Spatial block cross-validation
 - 70/30 Split of training vs. test data
-- Grid search hyperparameter tuning for RF and GBM
-Key findings:
+#### Algorithms
+
+Random Forest
+
+- Spatial block cross-validation during training
+
+- Hyperparameter tuning of `mtry`
+
+Gradient Boosting Machine
+
+- Spatial block cross-validation during training
+
+- Hyperparameter tuning across `n.trees`, `interaction.depth`, `shrinkage`, `n.minobsinnode`
+
+Generalized Linear Model
+
+- Spatial block cross-validation during training
+
+- Logistic model with binomial link function
+
+Neural Network
+
+- Three hidden layers (sigmoid - 500, leaky_relu - 200, leaky_relu - 50)
+
+- Binomial loss
+
+- Adam optimizer
+
+### Key findings:
 
-- Performance: RF > GBM > GLM worst >> NN
+- Performance: RF \> GBM \> GLM \> NN
 - Convergence problems with Neural Network models
 - More occurrence records and larger range sizes tended to improve model performance
 - Higher range coverage correlated with better performance.
@@ -51,7 +111,8 @@ performance = ssdm_results %>%
   dplyr::mutate(
     metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")),
     value = round(pmax(value, 0, na.rm = T), 3) # Fix one weird instance of f1 < 0
-  )
+  ) %>%
+  dplyr::filter(metric %in% focal_metrics)
 
 DT::datatable(performance)
 ```
@@ -64,29 +125,15 @@ DT::datatable(performance)
 ```
 
 ```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE}
 df_plot = performance
 
-# Create base plot
-plot <- plot_ly(
-  data = df_plot,
-  x = ~obs,
-  y = ~value,
-  color = ~model,
-  type = 'scatter',
-  mode = 'markers',
-  name = ~model,
-  hoverinfo = 'text',
-  text = ~paste("Species:", species, "<br>Obervations:", obs, "<br>Value:", round(value, 3)),
-  transforms = list(
-    list(
-      type = 'filter',
-      target = ~metric,
-      operation = '=',
-      value = 'auc' # default value
-    )
-  )
-)
+# Calculate regression lines for each model and metric combination
+suppressWarnings({
+  regression_lines = df_plot %>%
+    group_by(model, metric) %>%
+    group_modify(~asym_regression(.x$obs, .x$value))
+})
 
-# Add dropdown for selecting metric
-plot <- plot %>%
+# Create base plot
+plot <- plot_ly() %>%
   layout(
     title = "Model Performance vs. 
Number of observations", xaxis = list(title = "Number of observations", type = "log"), @@ -98,18 +145,58 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) +# Points +for (model_name in unique(df_plot$model)) { + plot = plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~obs, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +# Add regression lines +for(model_name in unique(df_plot$model)){ + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + bslib::card(plot, full_screen = T) ``` @@ -131,138 +218,77 @@ df_join = range_maps %>% df_plot = performance %>% inner_join(df_join, by = c("species" = "name_matched")) -# Function to calculate regression line with confidence intervals -calculate_regression = function(data) { - # Fit log-linear model - model = lm(value ~ log(range_size), data = data) - new_x = data.frame(range_size = seq(min(data$range_size), max(data$range_size), length.out = 100)) - # Predict using log-transformed x - pred = predict(model, newdata = data.frame(range_size = log(new_x$range_size)), interval = "confidence") - data.frame( - range_size = new_x$range_size, # Keep original scale for plotting - fit = pred[,"fit"], - lower = pred[,"lwr"], - upper = pred[,"upr"] - ) -} - # Calculate regression lines for each model and metric combination -regression_lines = df_plot %>% - group_by(model, metric) %>% - group_modify(~calculate_regression(.x)) +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~asym_regression(.x$range_size, .x$value)) +}) -# Create base scatter plot -plot <- plot_ly( - data = df_plot, - x = ~range_size, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range size:", round(range_size, -3), "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value +# Create base plot +plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Range size", + xaxis = list(title = "Range size"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) ) ) -) # Add regression lines and confidence intervals for each model for (model_name in unique(df_plot$model)) { plot <- plot %>% - add_trace( - data = regression_lines %>% filter(model == model_name), + add_markers( + data = filter(df_plot, model == model_name), x = ~range_size, - y = ~fit, - type = 'scatter', - mode = 'lines', - legendgroup = paste0(model_name, "regression"), - name = paste(model_name, '(fit)'), - line = list(width = 2), - text = NULL, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range size:", range_size, "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', - value = 'auc' + value = focal_metrics[1] ) ) - ) %>% - add_trace( - data = regression_lines %>% filter(model == model_name), - x = ~range_size, - y = ~upper, - type = 'scatter', - mode = 'lines', - legendgroup = paste0(model_name, "regression"), - line = list(width = 0), - showlegend = FALSE, - name = paste(model_name, '(upper CI)'), - text = NULL, - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' - ) - ) - ) %>% - add_trace( - data = regression_lines %>% filter(model == model_name), - x = ~range_size, - y = ~lower, - type = 'scatter', - mode = 'lines', - legendgroup = paste0(model_name, "regression"), - fill = 'tonexty', - fillcolor = list(color = 'rgba(0,0,0,0.2)'), - line = list(width = 0), + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), showlegend = FALSE, - name = paste(model_name, '(lower CI)'), - text = NULL, transforms = list( list( type = 'filter', target = ~metric, operation = '=', - value = 'auc' + value = focal_metrics[1] ) ) ) } -# Add layout with dropdown -plot <- plot %>% - layout( - title = "Model Performance vs. 
Range size", - xaxis = list(title = "Range size [sqkm]", type = "log"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), - margin = list(r = 150), - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) - ) - ) - ) - bslib::card(plot, full_screen = T) ``` @@ -298,29 +324,15 @@ df_join = df_cells_total %>% df_plot = performance %>% inner_join(df_join, by = "species") -# Create base plot -plot <- plot_ly( - data = df_plot, - x = ~range_cov, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range coverage:", round(range_cov, 3), "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value - ) - ) -) +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_cov, .x$value)) +}) -# Add dropdown for selecting metric -plot <- plot %>% +# Create base plot +plot <- plot_ly() %>% layout( title = "Model Performance vs. Range coverage", xaxis = list(title = "Range coverage"), @@ -332,18 +344,57 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_cov, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage:", range_cov, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + 
) +} + bslib::card(plot, full_screen = T) ``` @@ -373,29 +424,15 @@ df_join = df_occs_total %>% df_plot = performance %>% inner_join(df_join, by = "species") -# Create base plot -plot <- plot_ly( - data = df_plot, - x = ~range_bias, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range coverage bias:", round(range_bias, 3), "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value - ) - ) -) +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_bias, .x$value)) +}) -# Add dropdown for selecting metric -plot <- plot %>% +# Create base plot +plot <- plot_ly() %>% layout( title = "Model Performance vs. Range coverage bias", xaxis = list(title = "Range coverage bias"), @@ -407,18 +444,58 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_bias, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + + bslib::card(plot, full_screen = T) ``` @@ -427,7 +504,7 @@ bslib::card(plot, full_screen = T) Functional groups were assigned based on taxonomic order. 
The following groupings were used:
 
 | Functional group | Taxonomic orders |
-|------------------|-----------------------------------------------------|
+|-----------------------|-------------------------------------------------|
 | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla |
 | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
 | arboreal | Primates, Pilosa |
@@ -445,7 +522,7 @@ plot <- plot_ly(
   y = ~value,
   color = ~model,
   type = 'box',
-  boxpoints = "all",
+  boxpoints = "all", 
   jitter = 1,
   pointpos = 0,
   hoverinfo = 'text',
@@ -455,7 +532,7 @@ plot <- plot_ly(
       type = 'filter',
       target = ~metric,
      operation = '=',
-      value = 'auc' # default value
+      value = focal_metrics[1] # default value
     )
   )
 )
@@ -473,14 +550,7 @@ plot <- plot %>%
     list(
       type = "dropdown",
       active = 0,
-      buttons = list(
-        list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"),
-        list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"),
-        list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"),
-        list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"),
-        list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"),
-        list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall")
-      )
+      buttons = plotly_buttons
     )
   )
 )
diff --git a/R/_publish.yml b/R/_publish.yml
new file mode 100644
index 0000000..02920d1
--- /dev/null
+++ b/R/_publish.yml
@@ -0,0 +1,4 @@
+- source: 05_performance_analysis.qmd
+  quarto-pub:
+    - id: 309c45c3-1515-4985-a435-9ffa1888c5e7
+      url: 'https://chrkoenig.quarto.pub/ssdm-performance-analysis-0a06'
-- 
GitLab