diff --git a/R/04_02_msdm_modeling.R b/R/04_02_msdm_modeling.R
index 84f9ba5c2cd4faab3ce35ebc45b2331397ac0fd4..a9e5eea27027b37d37845d18320ac91ae19990ec 100644
--- a/R/04_02_msdm_modeling.R
+++ b/R/04_02_msdm_modeling.R
@@ -23,7 +23,7 @@ msdm_fit = dnn(
   hidden = c(500L, 1000L, 1000L),
   loss = "binomial",
   activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 10000L,
+  epochs = 25000L,
   lr = 0.01,
   baseloss = 1,
   batchsize = nrow(train_data)/5,
@@ -36,12 +36,12 @@ msdm_fit = dnn(
   device = "cuda"
 )
 
-save(msdm_fit, file = "data/r_objects/msdm_fit.RData")
+save(msdm_fit, file = "data/r_objects/msdm_fit2.RData")
 
 # ----------------------------------------------------------------------#
 # Evaluate model                                                      ####
 # ----------------------------------------------------------------------#
-load("data/r_objects/msdm_fit.RData")
+load("data/r_objects/msdm_fit2.RData")
 
 # Overall
 preds_train = predict(msdm_fit, newdata = as.matrix(train_data), type = "response")
@@ -56,7 +56,8 @@ eval_overall = evaluate_model(msdm_fit, test_data)
 data_split = split(model_data, model_data$species)
 
 msdm_results = lapply(data_split, function(data_spec){
-  test_data = dplyr::filter(data_spec, train == 0)
+  test_data = dplyr::filter(data_spec, train == 0) %>%
+    dplyr::select(-species)
 
   msdm_performance = tryCatch({
     evaluate_model(msdm_fit, test_data)
@@ -77,4 +78,4 @@ msdm_results = lapply(data_split, function(data_spec){
   )
 }) %>% bind_rows()
 
-save(msdm_results, file = "data/r_objects/msdm_results.RData")
+save(msdm_results, file = "data/r_objects/msdm_results2.RData")
diff --git a/R/05_performance_analysis.qmd b/R/05_performance_analysis.qmd
index c2d7d442fe8689080c38bbbdc8acf5e926ab8c2a..7eb33f11dc84d6568cbb81df38cabb2cd9bd4a9a 100644
--- a/R/05_performance_analysis.qmd
+++ b/R/05_performance_analysis.qmd
@@ -12,8 +12,10 @@ library(sf)
 library(plotly)
 library(DT)
 
+
 load("../data/r_objects/ssdm_results.RData")
-load("../data/r_objects/msdm_results.RData")
+load("../data/r_objects/msdm_fit2.RData")
+load("../data/r_objects/msdm_results2.RData")
 load("../data/r_objects/range_maps.RData")
 load("../data/r_objects/range_maps_gridded.RData")
 load("../data/r_objects/occs_final.RData")
@@ -54,7 +56,7 @@ lin_regression = function(x, y){
 
 ## Summary
 
-This document summarizes the performance of four SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `r I(length(unique(ssdm_results$species)))` South American mammal species. We use `r I(length(focal_metrics))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group).
+This document summarizes the performance of four SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `r I(length(unique(ssdm_results$species)))` South American mammal species. We use `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group).
 
 ### Modeling decisions:
 
@@ -70,31 +72,30 @@ This document summarizes the performance of four SDM algorithms (Random Forest,
 Random Forest
 
 - Spatial block cross-validation during training
-
 - Hyperparameter tuning of `mtry`
 
-Generalized boosted maching
+Generalized boosted machine
 
 - Spatial block cross-validation during training
-
 - Hyperparameter tuning across `n.trees` , `interaction.depth` , `shrinkage`, `n.minobsinnode`
 
 Generalized Linear Model
 
 - Spatial block cross-validation during training
-
 - Logistic model with binomial link function
 
-Neural Netwok (single-species)
+Neural Network (single-species)
 
 - Three hidden layers (sigmoid - 500, leaky_relu - 200, leaky_relu - 50)
-
-- Binomial loss
-
-- Adam optimizer
+- Binomial loss, ADAM optimizer
+- Poor convergence
 
 Neural Network (multi-species)
 
+- Three hidden layers (sigmoid - 500, leaky_relu - 1000, leaky_relu - 1000)
+- Binomial loss, ADAM optimizer
+- Very slow convergence (`r I(length(which(!is.na(msdm_fit$losses$train_l))))` epochs)
+
 ### Key findings:
 
 - Performance: RF \> GBM \> GLM \> NN
@@ -507,7 +508,7 @@ bslib::card(plot, full_screen = T)
 Functional groups were assigned based on taxonomic order. The following groupings were used:
 
-| Functional group | Taxomic orders |
-|------------------------|------------------------------------------------|
+| Functional group | Taxonomic orders |
+|------------------------|-----------------------------------------------|
 | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla |
 | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
 | arboreal | Primates, Pilosa |
@@ -560,3 +561,26 @@ plot <- plot %>%
 
 bslib::card(plot, full_screen = T)
 ```
+
+### Relative performance
+
+The table below summarizes the relative performance of the models across different observation frequency ranges. The `rank` column indicates the model's performance rank compared to all other models for a given species and metric. The subsequent columns, `(1,10]`, `(10,25]`, ..., `(5000, Inf]`, represent bins of observation frequency. The values in these columns show how many times the model's performance achieved the specified `rank` within the respective frequency range.
+
+```{r msdm_vs_ssdm, echo = FALSE, message=FALSE, warning=FALSE}
+freq_thresholds = c(1, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, Inf)
+
+df_print = performance %>%
+  mutate(freq_class = cut(obs, freq_thresholds, dig.lab = 5)) %>%
+  group_by(species, metric, freq_class) %>%
+  dplyr::mutate(rank = dplyr::min_rank(dplyr::desc(value))) %>% # rank models per species and metric (order() would return a permutation, not ranks)
+  group_by(model, metric, rank, freq_class) %>%
+  tally() %>%
+  pivot_wider(names_from = freq_class, values_from = n) %>%
+  dplyr::select("model", "metric", "rank", "(1,10]", "(10,25]",
+                "(25,50]", "(50,100]", "(100,250]", "(250,500]",
+                "(500,1000]", "(1000,2500]", "(2500,5000]", "(5000,Inf]") %>%
+  dplyr::arrange(metric, rank, model) %>%
+  replace(is.na(.), 0)
+
+DT::datatable(df_print)
+```
diff --git a/R/05_performance_analysis.rmarkdown b/R/05_performance_analysis.rmarkdown
deleted file mode 100644
index d82a62297406fbfbed633ecbdc7ecba1cbc828bf..0000000000000000000000000000000000000000
--- a/R/05_performance_analysis.rmarkdown
+++ /dev/null
@@ -1,575 +0,0 @@
----
-title: "sSDM Performance analysis"
-format: html
-editor: visual
-engine: knitr
----
-
-```{r init, echo = FALSE, include = FALSE}
-library(tidyverse)
-library(Symobio)
-library(sf)
-library(plotly)
-library(DT)
-
-load("../data/r_objects/ssdm_results.RData")
-load("../data/r_objects/msdm_results.RData")
-load("../data/r_objects/range_maps.RData")
-load("../data/r_objects/range_maps_gridded.RData")
-load("../data/r_objects/occs_final.RData")
-load("../data/r_objects/functional_groups.RData")
-sf::sf_use_s2(use_s2 = FALSE)
-```
-
-```{r globals, echo = FALSE, include = FALSE}
-# Select metrics
-focal_metrics = c("auc", "f1", "accuracy") # There's a weird bug in plotly that scrambles up lines when using more then three groups
-
-# Dropdown options
-plotly_buttons = list()
-for(metric in focal_metrics){
-  plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", metric), label = metric)
-}
-
-# Regression functions
-asym_regression = function(x, y){
-  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
-  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
-  data.frame(
-    x = new_x,
-    fit = predict(nls_fit, newdata = data.frame(x = new_x))
-  )
-}
-
-lin_regression = function(x, y){
-  glm_fit = suppressWarnings(glm(y~x, family = "binomial"))
-  new_x = seq(min(x), max(x), length.out = 100)
-  data.frame(
-    x = new_x,
-    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
-  )
-}
-
-```
-
-
-## Summary
-
-This document summarizes the performance of four SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model, Deep Neural Network) for `r I(length(unique(ssdm_results$species)))` South American mammal species. We use `r I(length(focal_metrics))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group).
-
-### Modeling decisions:
-
-#### Data
-
-- Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25)
-- Balanced presences and absences for each species
-- Predictors: all 19 CHELSA bioclim variables
-- 70/30 Split of training vs. test data
-
-#### Algorithms
-
-Random Forest
-
-- Spatial block cross-validation during training
-
-- Hyperparameter tuning of `mtry`
-
-Generalized boosted maching
-
-- Spatial block cross-validation during training
-
-- Hyperparameter tuning across `n.trees` , `interaction.depth` , `shrinkage`, `n.minobsinnode`
-
-Generalized Linear Model
-
-- Spatial block cross-validation during training
-
-- Logistic model with binomial link function
-
-Neural Netwok (single-species)
-
-- Three hidden layers (sigmoid - 500, leaky_relu - 200, leaky_relu - 50)
-
-- Binomial loss
-
-- Adam optimizer
-
-Neural Network (multi-species)
-
-### Key findings:
-
-- Performance: RF \> GBM \> GLM \> NN
-- Convergence problems with Neural Notwork Models
-- More occurrence records and larger range sizes tended to improve model performance
-- Higher range coverage correlated with better performance.
-- Range coverage bias and functional group showed some impact but were less consistent
-
-## Analysis
-
-The table below shows the analysed modeling results.
-
-
-```{r performance, echo = FALSE, message=FALSE, warnings=FALSE}
-performance = bind_rows(ssdm_results, msdm_results) %>%
-  pivot_longer(c(auc, accuracy, kappa, precision, recall, f1), names_to = "metric") %>%
-  dplyr::filter(!is.na(value)) %>%
-  dplyr::mutate(
-    metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")),
-    value = round(pmax(value, 0, na.rm = T), 3) # Fix one weird instance of f1 < 0
-  ) %>%
-  dplyr::filter(metric %in% focal_metrics)
-
-DT::datatable(performance)
-```
-
-
-### Number of records
-
-- Model performance was generally better for species with more observations
-- Very poor performance below 50-100 observations
-
-
-```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE}
-df_plot = performance
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~asym_regression(.x$obs, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>%
-  layout(
-    title = "Model Performance vs. Number of observations",
-    xaxis = list(title = "Number of observations", type = "log"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-    margin = list(r = 150), # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Points
-for (model_name in unique(df_plot$model)) {
-  plot = plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~obs,
-      y = ~value,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-# Add regression lines
-for(model_name in unique(df_plot$model)){
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>%
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-
-### Range characteristics
-
-#### Range size
-
-Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).
-
-- Model performance tended to be slightly higher for species with larger range size
-- Only RF shows continuous performance improvements beyond range sizes of \~5M km²
-
-
-```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE}
-
-df_join = range_maps %>%
-  dplyr::mutate(range_size = as.numeric(st_area(range_maps) / 1000000)) %>% # range in sqkm
-  sf::st_drop_geometry()
-
-df_plot = performance %>%
-  inner_join(df_join, by = c("species" = "name_matched"))
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~asym_regression(.x$range_size, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>%
-  layout(
-    title = "Model Performance vs. Range size",
-    xaxis = list(title = "Range size"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-    margin = list(r = 150), # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Add regression lines and confidence intervals for each model
-for (model_name in unique(df_plot$model)) {
-  plot <- plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~range_size,
-      y = ~value,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Range size:", range_size, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-for (model_name in unique(df_plot$model)) {
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>%
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-
-#### Range coverage
-
-Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.
-
-$$
-RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
-$$
-
-- Models for species with higher range coverage showed slightly better performance
-
-
-```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE}
-df_cells_total = range_maps_gridded %>%
-  dplyr::rename("species" = name_matched) %>%
-  group_by(species) %>%
-  summarise(cells_total = n()) %>%
-  st_drop_geometry()
-
-df_cells_occ <- range_maps_gridded %>%
-  st_join(occs_final, join = st_intersects) %>%
-  filter(name_matched == species) %>% # Filter only intersections of the same species
-  group_by(species) %>%
-  summarise(cells_occupied = n_distinct(geometry)) %>%
-  st_drop_geometry()
-
-df_join = df_cells_total %>%
-  dplyr::inner_join(df_cells_occ, by = "species") %>%
-  dplyr::mutate(range_cov = cells_occupied / cells_total) %>%
-  dplyr::select(species, range_cov)
-
-df_plot = performance %>%
-  inner_join(df_join, by = "species")
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~lin_regression(.x$range_cov, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>%
-  layout(
-    title = "Model Performance vs. Range coverage",
-    xaxis = list(title = "Range coverage"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-    margin = list(r = 150), # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Add regression lines and confidence intervals for each model
-for (model_name in unique(df_plot$model)) {
-  plot <- plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~range_cov,
-      y = ~value,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Range coverage:", range_cov, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-for (model_name in unique(df_plot$model)) {
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>%
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-
-#### Range coverage bias
-
-Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
-
-$$
-RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)}
-$$
-
-Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
-
-- There was no strong relationship between range coverage bias and model performance
-
-
-```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE}
-df_occs_total = occs_final %>%
-  st_drop_geometry() %>%
-  group_by(species) %>%
-  summarise(occs_total = n())
-
-df_join = df_occs_total %>%
-  dplyr::inner_join(df_cells_total, by = "species") %>%
-  dplyr::inner_join(df_cells_occ, by = "species") %>%
-  dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1)))
-
-df_plot = performance %>%
-  inner_join(df_join, by = "species")
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~lin_regression(.x$range_bias, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>%
-  layout(
-    title = "Model Performance vs. Range coverage bias",
-    xaxis = list(title = "Range coverage bias"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-    margin = list(r = 150), # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Add regression lines and confidence intervals for each model
-for (model_name in unique(df_plot$model)) {
-  plot <- plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~range_bias,
-      y = ~value,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-for (model_name in unique(df_plot$model)) {
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>%
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name, # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-
-bslib::card(plot, full_screen = T)
-```
-
-
-### Functional group
-
-Functional groups were assigned based on taxonomic order. The following groupings were used:
-
-| Functional group | Taxomic orders |
-|------------------------|------------------------------------------------|
-| large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla |
-| small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
-| arboreal | Primates, Pilosa |
-| flying | Chiroptera |
-
-- Models for bats tended to perform slightly worse than for other groups.
-
-
-```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE}
-df_plot = performance %>%
-  dplyr::left_join(functional_groups, by = c("species" = "name_matched"))
-
-plot <- plot_ly(
-  data = df_plot,
-  x = ~functional_group,
-  y = ~value,
-  color = ~model,
-  type = 'box',
-  boxpoints = "all",
-  jitter = 1,
-  pointpos = 0,
-  hoverinfo = 'text',
-  text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)),
-  transforms = list(
-    list(
-      type = 'filter',
-      target = ~metric,
-      operation = '=',
-      value = focal_metrics[1] # default value
-    )
-  )
-)
-
-plot <- plot %>%
-  layout(
-    title = "Model Performance vs. Functional Group",
-    xaxis = list(title = "Functional group"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-    margin = list(r = 150), # Add right margin to accommodate legend
-    hovermode = 'closest',
-    boxmode = "group",
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-bslib::card(plot, full_screen = T)
-```
-
diff --git a/R/utils.R b/R/utils.R
index 5539cfd5a9eed4da5e7aea07de93fac44ac676e2..edde20d758b1d33e81cf00ef13caa997496b6fa3 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -40,13 +40,6 @@ evaluate_model <- function(model, test_data) {
   
   # Predict probabilities
   if(class(model) %in% c("citodnn", "citodnnBootstrap")){
-    # This is a hack to fix naming issue during initial training run. Fix for later runs:
-    # ----
-    test_data = test_data %>%
-      dplyr::select(-species) %>%
-      dplyr::rename(species = species_int)
-    # ----
-    
     probs <- predict(model, as.matrix(test_data), type = "response")[,1]
     preds <- factor(round(probs), levels = c("0", "1"), labels = c("A", "P"))
   } else {
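
Context for the `utils.R` change: `evaluate_model()` no longer renames `species_int` internally, so callers must drop the `species` id column themselves before predicting, as `R/04_02_msdm_modeling.R` now does. Below is a minimal sketch of the resulting per-species evaluation flow, assuming `model_data`, `msdm_fit`, and `evaluate_model()` as defined in this repo (the focal species name is hypothetical):

```r
library(dplyr)

# Test split for a single species; the species id column must be removed
# before prediction, since evaluate_model() no longer strips or renames it.
test_data = model_data %>%
  filter(species == "Panthera onca", train == 0) %>%  # hypothetical focal species
  select(-species)

eval_spec = evaluate_model(msdm_fit, test_data)
```

Relatedly, the `msdm_vs_ssdm` chunk ranks models with `dplyr::min_rank()` rather than `order()`, because `order()` returns the permutation that sorts a vector, not each element's rank. A small illustration with made-up metric values:

```r
value = c(0.81, 0.94, 0.88)           # e.g. one metric for three models (hypothetical)
order(value, decreasing = TRUE)       # 2 3 1 -> indices of the sorted order
dplyr::min_rank(dplyr::desc(value))   # 3 1 2 -> rank of each value, 1 = best
```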