diff --git a/.gitignore b/.gitignore index 51c293e4bc2a9ef5d1fb9f55837fcbab31b2295f..4c95a7b17d2e3f0cf797fca434d0d6cf519e441d 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,5 @@ renv/cache/ # Data files data/ -R/performance_analysis_files -R/performance_analysis.html \ No newline at end of file +R/05_performance_analysis_files +R/05_performance_analysis.html \ No newline at end of file diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R index 5e8b92b7a0837b3cb50e377bc8af7d9017ee8dd2..22e35e6d3ecff5b2a26397c68074d29f583ea825 100644 --- a/R/04_01_ssdm_modeling.R +++ b/R/04_01_ssdm_modeling.R @@ -73,7 +73,7 @@ ssdm_results = furrr::future_map(pa_split, .options = furrr::furrr_options(seed # Gradient Boosted Machine #### gbm_performance = tryCatch({ gbm_grid <- expand.grid( - n.trees = c(100, 500, 1000, 1500), # Higher number of boosting iterations + n.trees = c(100, 500, 1000, 1500), # number of trees interaction.depth = c(3, 5, 7), # Maximum depth of each tree shrinkage = c(0.01, 0.005, 0.001), # Lower learning rates n.minobsinnode = c(10, 20) # Minimum number of observations in nodes diff --git a/R/05_performance_analysis.qmd b/R/05_performance_analysis.qmd index 8941c875e028412e74807ebb2bc9d5f49cd2de5b..a481b1d7032b42d65ce7cebb1a9b5f03aa5c83b4 100644 --- a/R/05_performance_analysis.qmd +++ b/R/05_performance_analysis.qmd @@ -2,6 +2,7 @@ title: "sSDM Performance analysis" format: html editor: visual +engine: knitr --- ```{r init, echo = FALSE, include = FALSE} @@ -19,22 +20,81 @@ load("../data/r_objects/functional_groups.RData") sf::sf_use_s2(use_s2 = FALSE) ``` +```{r globals, echo = FALSE, include = FALSE} +# Select metrics +focal_metrics = c("auc", "f1", "accuracy") # There's a weird bug in plotly that scrambles up lines when using more than three groups + +# Dropdown options +plotly_buttons = list() +for(metric in focal_metrics){ + plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", metric), label = metric) +} + +# Regression functions +asym_regression = function(x, y){ + nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) + new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) + data.frame( + x = new_x, + fit = predict(nls_fit, newdata = data.frame(x = new_x)) + ) +} + +lin_regression = function(x, y){ + glm_fit = suppressWarnings(glm(y~x, family = "binomial")) + new_x = seq(min(x), max(x), length.out = 100) + data.frame( + x = new_x, + fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") + ) +} + +``` + ## Summary -This document summarizes the performance of three SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `{r} length(unique(ssdm_results$species))` South American mammal species. We use six metrics (AUC, F1, kappa, accuracy, precision, and recall) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group). +This document summarizes the performance of four SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `r I(length(unique(ssdm_results$species)))` South American mammal species. 
We use `r I(length(focal_metrics))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group). + +### Modeling decisions: -Modeling decisions: +#### Data - Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25) - Balanced presences and absences for each species - Predictors: all 19 CHELSA bioclim variables -- Spatial block cross-validation - 70/30 Split of training vs. test data -- Grid search hyperparameter tuning for RF and GBM -Key findings: +#### Algorithms + +Random Forest + +- Spatial block cross-validation during training + +- Hyperparameter tuning of `mtry` + +Gradient Boosting Machine + +- Spatial block cross-validation during training + +- Hyperparameter tuning across `n.trees`, `interaction.depth`, `shrinkage`, `n.minobsinnode` + +Generalized Linear Model + +- Spatial block cross-validation during training + +- Logistic model with binomial link function + +Neural Network + +- Three hidden layers (sigmoid - 500, leaky_relu - 200, leaky_relu - 50) + +- Binomial loss + +- Adam optimizer + +### Key findings: -- Performance: RF > GBM > GLM worst >> NN +- Performance: RF \> GBM \> GLM \> NN - Convergence problems with Neural Notwork Models - More occurrence records and larger range sizes tended to improve model performance - Higher range coverage correlated with better performance. @@ -51,7 +111,8 @@ performance = ssdm_results %>% dplyr::mutate( metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")), value = round(pmax(value, 0, na.rm = T), 3) # Fix one weird instance of f1 < 0 - ) + ) %>% + dplyr::filter(metric %in% focal_metrics) DT::datatable(performance) ``` @@ -64,29 +125,15 @@ DT::datatable(performance) ```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} df_plot = performance -# Create base plot -plot <- plot_ly( - data = df_plot, - x = ~obs, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Obervations:", obs, "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value - ) - ) -) +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~asym_regression(.x$obs, .x$value)) +}) -# Add dropdown for selecting metric -plot <- plot %>% +# Create base plot +plot <- plot_ly() %>% layout( title = "Model Performance vs. 
Number of observations", xaxis = list(title = "Number of observations", type = "log"), @@ -98,18 +145,58 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) +# Points +for (model_name in unique(df_plot$model)) { + plot = plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~obs, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +# Add regression lines +for(model_name in unique(df_plot$model)){ + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + bslib::card(plot, full_screen = T) ``` @@ -131,138 +218,77 @@ df_join = range_maps %>% df_plot = performance %>% inner_join(df_join, by = c("species" = "name_matched")) -# Function to calculate regression line with confidence intervals -calculate_regression = function(data) { - # Fit log-linear model - model = lm(value ~ log(range_size), data = data) - new_x = data.frame(range_size = seq(min(data$range_size), max(data$range_size), length.out = 100)) - # Predict using log-transformed x - pred = predict(model, newdata = data.frame(range_size = log(new_x$range_size)), interval = "confidence") - data.frame( - range_size = new_x$range_size, # Keep original scale for plotting - fit = pred[,"fit"], - lower = pred[,"lwr"], - upper = pred[,"upr"] - ) -} - # Calculate regression lines for each model and metric combination -regression_lines = df_plot %>% - group_by(model, metric) %>% - group_modify(~calculate_regression(.x)) +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~asym_regression(.x$range_size, .x$value)) +}) -# Create base scatter plot -plot <- plot_ly( - data = df_plot, - x = ~range_size, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range size:", round(range_size, -3), "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value +# Create base plot +plot <- plot_ly() %>% + layout( + title = "Model Performance vs. 
Range size", + xaxis = list(title = "Range size"), + yaxis = list(title = "Value"), + legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot + margin = list(r = 150), # Add right margin to accommodate legend + hovermode = 'closest', + updatemenus = list( + list( + type = "dropdown", + active = 0, + buttons = plotly_buttons + ) ) ) -) # Add regression lines and confidence intervals for each model for (model_name in unique(df_plot$model)) { plot <- plot %>% - add_trace( - data = regression_lines %>% filter(model == model_name), + add_markers( + data = filter(df_plot, model == model_name), x = ~range_size, - y = ~fit, - type = 'scatter', - mode = 'lines', - legendgroup = paste0(model_name, "regression"), - name = paste(model_name, '(fit)'), - line = list(width = 2), - text = NULL, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range size:", range_size, "<br>Value:", round(value, 3)), transforms = list( list( type = 'filter', target = ~metric, operation = '=', - value = 'auc' + value = focal_metrics[1] ) ) - ) %>% - add_trace( - data = regression_lines %>% filter(model == model_name), - x = ~range_size, - y = ~upper, - type = 'scatter', - mode = 'lines', - legendgroup = paste0(model_name, "regression"), - line = list(width = 0), - showlegend = FALSE, - name = paste(model_name, '(upper CI)'), - text = NULL, - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' - ) - ) - ) %>% - add_trace( - data = regression_lines %>% filter(model == model_name), - x = ~range_size, - y = ~lower, - type = 'scatter', - mode = 'lines', - legendgroup = paste0(model_name, "regression"), - fill = 'tonexty', - fillcolor = list(color = 'rgba(0,0,0,0.2)'), - line = list(width = 0), + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), showlegend = FALSE, - name = paste(model_name, '(lower CI)'), - text = NULL, transforms = list( list( type = 'filter', target = ~metric, operation = '=', - value = 'auc' + value = focal_metrics[1] ) ) ) } -# Add layout with dropdown -plot <- plot %>% - layout( - title = "Model Performance vs. 
Range size", - xaxis = list(title = "Range size [sqkm]", type = "log"), - yaxis = list(title = "Value"), - legend = list(x = 1.1, y = 0.5), - margin = list(r = 150), - hovermode = 'closest', - updatemenus = list( - list( - type = "dropdown", - active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) - ) - ) - ) - bslib::card(plot, full_screen = T) ``` @@ -298,29 +324,15 @@ df_join = df_cells_total %>% df_plot = performance %>% inner_join(df_join, by = "species") -# Create base plot -plot <- plot_ly( - data = df_plot, - x = ~range_cov, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range coverage:", round(range_cov, 3), "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value - ) - ) -) +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_cov, .x$value)) +}) -# Add dropdown for selecting metric -plot <- plot %>% +# Create base plot +plot <- plot_ly() %>% layout( title = "Model Performance vs. Range coverage", xaxis = list(title = "Range coverage"), @@ -332,18 +344,57 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_cov, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage:", range_cov, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + 
) +} + bslib::card(plot, full_screen = T) ``` @@ -373,29 +424,15 @@ df_join = df_occs_total %>% df_plot = performance %>% inner_join(df_join, by = "species") -# Create base plot -plot <- plot_ly( - data = df_plot, - x = ~range_bias, - y = ~value, - color = ~model, - type = 'scatter', - mode = 'markers', - name = ~model, - hoverinfo = 'text', - text = ~paste("Species:", species, "<br>Range coverage bias:", round(range_bias, 3), "<br>Value:", round(value, 3)), - transforms = list( - list( - type = 'filter', - target = ~metric, - operation = '=', - value = 'auc' # default value - ) - ) -) +# Calculate regression lines for each model and metric combination +suppressWarnings({ + regression_lines = df_plot %>% + group_by(model, metric) %>% + group_modify(~lin_regression(.x$range_bias, .x$value)) +}) -# Add dropdown for selecting metric -plot <- plot %>% +# Create base plot +plot <- plot_ly() %>% layout( title = "Model Performance vs. Range coverage bias", xaxis = list(title = "Range coverage bias"), @@ -407,18 +444,58 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) +# Add regression lines and confidence intervals for each model +for (model_name in unique(df_plot$model)) { + plot <- plot %>% + add_markers( + data = filter(df_plot, model == model_name), + x = ~range_bias, + y = ~value, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + opacity = 0.6, + name = ~model, + hoverinfo = 'text', + text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)), + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + +for (model_name in unique(df_plot$model)) { + reg_data = dplyr::filter(regression_lines, model == model_name) + plot = plot %>% + add_lines( + data = reg_data, + x = ~x, + y = ~fit, + color = model_name, # Set color to match legendgroup + legendgroup = model_name, + name = paste(model_name, '(fit)'), + showlegend = FALSE, + transforms = list( + list( + type = 'filter', + target = ~metric, + operation = '=', + value = focal_metrics[1] + ) + ) + ) +} + + bslib::card(plot, full_screen = T) ``` @@ -427,7 +504,7 @@ bslib::card(plot, full_screen = T) Functional groups were assigned based on taxonomic order. 
The following groupings were used: | Functional group | Taxomic orders | -|------------------|-----------------------------------------------------| +|-----------------------|-------------------------------------------------| | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla | | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha | | arboreal | Primates, Pilosa | @@ -445,7 +522,7 @@ plot <- plot_ly( y = ~value, color = ~model, type = 'box', - boxpoints = "all", + boxpoints = "all", jitter = 1, pointpos = 0, hoverinfo = 'text', @@ -455,7 +532,7 @@ plot <- plot_ly( type = 'filter', target = ~metric, operation = '=', - value = 'auc' # default value + value = focal_metrics[1] # default value ) ) ) @@ -473,14 +550,7 @@ plot <- plot %>% list( type = "dropdown", active = 0, - buttons = list( - list(method = "restyle", args = list("transforms[0].value", "auc"), label = "AUC"), - list(method = "restyle", args = list("transforms[0].value", "kappa"), label = "Kappa"), - list(method = "restyle", args = list("transforms[0].value", "f1"), label = "F1 score"), - list(method = "restyle", args = list("transforms[0].value", "accuracy"), label = "Accuracy"), - list(method = "restyle", args = list("transforms[0].value", "precision"), label = "Precision"), - list(method = "restyle", args = list("transforms[0].value", "recall"), label = "Recall") - ) + buttons = plotly_buttons ) ) ) diff --git a/R/_publish.yml b/R/_publish.yml new file mode 100644 index 0000000000000000000000000000000000000000..02920d16bb1021f602d5a92d12ad9761ba6a0334 --- /dev/null +++ b/R/_publish.yml @@ -0,0 +1,4 @@ +- source: 05_performance_analysis.qmd + quarto-pub: + - id: 309c45c3-1515-4985-a435-9ffa1888c5e7 + url: 'https://chrkoenig.quarto.pub/ssdm-performance-analysis-0a06'
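Side note, not part of the diff above: the sketch below illustrates, on synthetic data, the asymptotic trend-line approach that 05_performance_analysis.qmd now uses for the "number of observations" and "range size" panels. The asym_regression() body is copied from the `globals` chunk in the diff; the synthetic `obs`/`auc` vectors, the seed, and the `trend` object are hypothetical and only show the expected input/output shape.

```r
# Synthetic example data (hypothetical): one AUC value per species plus the
# number of occurrence records, roughly saturating with log(obs).
set.seed(1)
obs = round(exp(runif(200, log(10), log(5000))))
auc = pmin(pmax(1 - 0.45 * exp(-0.2 * log(obs)) + rnorm(200, sd = 0.03), 0), 1)

# Copied from the `globals` chunk of 05_performance_analysis.qmd:
# fit a saturating curve and return 100 log-spaced points for the trend line.
asym_regression = function(x, y){
  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
  data.frame(
    x = new_x,
    fit = predict(nls_fit, newdata = data.frame(x = new_x))
  )
}

trend = asym_regression(obs, auc)
head(trend)  # columns: x (number of records) and fit (predicted metric value)
```

The returned data frame of `x` and `fit` values is what the qmd then passes to `add_lines()` for each model and metric combination.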