Commit 296d916c authored by ye87zine

Added more species to the modeling pipeline, fixed some bugs, and made some additions to the analysis script

parent 87abe201
...@@ -10,6 +10,6 @@ renv/staging/ ...@@ -10,6 +10,6 @@ renv/staging/
renv/cache/ renv/cache/
# Data files # Data files
r_objects/ data/
R/performance_analysis_files R/performance_analysis_files
R/performance_analysis.html R/performance_analysis.html
\ No newline at end of file
load("data/r_objects/range_maps.RData")
# Get taxonomic information for target species
names_unique = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
species_matched = lapply(names_unique, function(name){
match_result = Symobio::gbif_match_name(name = name)
if(match_result$status != "ACCEPTED"){
match_result = Symobio::gbif_match_name(usageKey = match_result$acceptedUsageKey)
}
match_result$name_orig = name
return(match_result)
}) %>%
bind_rows()
# Assign functional groups
functional_groups = species_matched %>%
mutate(
functional_group = case_when(
order %in% c("Carnivora", "Artiodactyla", "Cingulata", "Perissodactyla") ~ 1,
order %in% c("Rodentia", "Didelphimorphia", "Soricomorpha", "Paucituberculata", "Lagomorpha") ~ 2,
order %in% c("Primates", "Pilosa") ~ 3,
order %in% c("Chiroptera") ~ 4
),
functional_group = factor(functional_group, labels = c("large ground-dwelling", "small ground-dwelling", "arboreal", "flying"))
) %>%
dplyr::select(
name_orig,
name_matched = species,
functional_group
) %>%
distinct()
save(functional_groups, file = "data/r_objects/functional_groups.RData")
# Installation
options(timeout = 600) # increasing the timeout is recommended since we will be downloading a 2GB file
# For Windows and Linux, "cpu" and "cu117" are the only kinds currently supported
# For macOS, the supported kinds are "cpu-intel" and "cpu-m1"
kind <- "cu117"
version <- available.packages()["torch","Version"]
options(repos = c(
torch = sprintf("https://torch-cdn.mlverse.org/packages/%s/%s/", kind, version),
CRAN = "https://cloud.r-project.org" # or any other from which you want to install the other R dependencies.
))
install.packages("torch", type = "binary")
install.packages("torchvision")
install.packages("luz")
# Load packages
library(torch)
library(torchvision)
library(luz)
library(ggplot2)   # needed for plotting the example digit below
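# Optional sanity check (a sketch added here, not part of the original script): confirm
# that the torch backend downloaded and installed correctly before training below.
stopifnot(torch_is_installed())   # FALSE would indicate a failed binary download
print(torch_randn(2, 3)$shape)    # allocate a small tensor; expected shape [2, 3]
print(cuda_is_available())        # TRUE only with a CUDA build ("cu117") and a visible GPU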
# Get dataset
dir <- "../playground/dataset/mnist"
train_ds <- mnist_dataset(
dir,
download = TRUE,
transform = transform_to_tensor
)
test_ds <- mnist_dataset(
dir,
train = FALSE,
transform = transform_to_tensor
)
train_dl <- dataloader(train_ds, batch_size = 128, shuffle = TRUE)
test_dl <- dataloader(test_ds, batch_size = 128)
# Plot example character
image <- train_ds$data[1,1:28,1:28]
image_df <- data.table::melt(image)
ggplot(image_df, aes(x=Var2, y=Var1, fill=value))+
geom_tile(show.legend = FALSE) +
xlab("") + ylab("") +
scale_fill_gradient(low="white", high="black")
# Build network
net <- nn_module(
"Net",
initialize = function() {
self$conv1 <- nn_conv2d(1, 32, 3, 1)
self$conv2 <- nn_conv2d(32, 64, 3, 1)
self$dropout1 <- nn_dropout2d(0.25)
self$dropout2 <- nn_dropout2d(0.5)
self$fc1 <- nn_linear(9216, 128)
self$fc2 <- nn_linear(128, 10)
},
forward = function(x) {
x %>% # N * 1 * 28 * 28
self$conv1() %>% # N * 32 * 26 * 26
nnf_relu() %>%
self$conv2() %>% # N * 64 * 24 * 24
nnf_relu() %>%
nnf_max_pool2d(2) %>% # N * 64 * 12 * 12
self$dropout1() %>%
torch_flatten(start_dim = 2) %>% # N * 9216
self$fc1() %>% # N * 128
nnf_relu() %>%
self$dropout2() %>%
self$fc2() # N * 10
}
)
# Train
fitted <- net %>%
setup(
loss = nn_cross_entropy_loss(),
optimizer = optim_adam,
metrics = list(
luz_metric_accuracy()
)
) %>%
fit(train_dl, epochs = 10, valid_data = test_dl)
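# Evaluation sketch (an added illustration, assuming the standard luz interface):
# evaluate() reports loss and the accuracy metric on held-out data, and predict()
# returns the raw class scores for the test set.
fitted %>% evaluate(test_dl)                            # held-out loss and accuracy
preds <- fitted %>% predict(test_dl)                    # N x 10 tensor of class scores
pred_classes <- as_array(torch_argmax(preds, dim = 2))  # class index 1..10 corresponds to digits 0..9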
...@@ -19,13 +19,13 @@ library(pROC) ...@@ -19,13 +19,13 @@ library(pROC)
library(cito) library(cito)
source("R/utils.R") source("R/utils.R")
con = db_connect() con = db_connect()
sf::sf_use_s2(use_s2 = FALSE) sf::sf_use_s2(use_s2 = FALSE)
# ---------------------------------------------------------------------------# # ---------------------------------------------------------------------------#
# Prepare Geodata #### # Prepare Geodata ####
# ---------------------------------------------------------------------------# # ---------------------------------------------------------------------------#
raster_info = tbl(con, "datasets") %>% raster_info = tbl(con, "datasets") %>%
dplyr::filter(stringr::str_detect(name, "CHELSA")) %>% dplyr::filter(stringr::str_detect(name, "CHELSA")) %>%
collect() collect()
...@@ -40,11 +40,11 @@ sa_polygon = rnaturalearth::ne_countries() %>% ...@@ -40,11 +40,11 @@ sa_polygon = rnaturalearth::ne_countries() %>%
# ---------------------------------------------------------------------------# # ---------------------------------------------------------------------------#
# Prepare Occurrence Data #### # Prepare Occurrence Data ####
# ---------------------------------------------------------------------------# # ---------------------------------------------------------------------------#
load("r_objects/test_species.RData") load("data/r_objects/range_maps.RData")
test_species = unique(test_species$name_matched) target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
occs = tbl(con, "species_occurrences") %>% occs = tbl(con, "species_occurrences") %>%
dplyr::filter(species %in% test_species) %>% dplyr::filter(species %in% target_species) %>%
dplyr::select(-year) %>% dplyr::select(-year) %>%
dplyr::distinct() %>% dplyr::distinct() %>%
collect() %>% collect() %>%
...@@ -81,10 +81,13 @@ occs_final = occs %>% ...@@ -81,10 +81,13 @@ occs_final = occs %>%
inner_join(env_vars, by = "coordinate_id") %>% inner_join(env_vars, by = "coordinate_id") %>%
dplyr::select(-coordinate_id) dplyr::select(-coordinate_id)
save(occs_final, file = "r_objects/occs_final.RData") save(occs_final, file = "data/r_objects/occs_final.RData")
# ---------------------------------------------------------------------------# # ---------------------------------------------------------------------------#
# Main loop #### # Main loop ####
# ---------------------------------------------------------------------------# # ---------------------------------------------------------------------------#
load("data/r_objects/occs_final.RData")
load("data/r_objects/sa_polygon.RData")
occs_split = split(occs_final, occs_final$species) occs_split = split(occs_final, occs_final$species)
future::plan("multisession", workers = 16) future::plan("multisession", workers = 16)
...@@ -130,7 +133,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -130,7 +133,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
terra::extract(sample_points) %>% terra::extract(sample_points) %>%
dplyr::select(-ID) %>% dplyr::select(-ID) %>%
dplyr::mutate( dplyr::mutate(
presence = "absent", presence = 0,
geometry = sample_points$x geometry = sample_points$x
) %>% ) %>%
tibble() tibble()
...@@ -139,9 +142,11 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -139,9 +142,11 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
# Create modeling dataset #### # Create modeling dataset ####
# ------------------------------- # # ------------------------------- #
model_data = occs_spec %>% model_data = occs_spec %>%
dplyr::mutate(presence = "present") %>% dplyr::mutate(presence = 1) %>%
bind_rows(abs_spec) %>% bind_rows(abs_spec) %>%
dplyr::mutate(presence = as.factor(presence)) dplyr::mutate(
presence_fct = as.factor(ifelse(presence == 0, "absent", "present"))
)
# Define cross-validation folds # Define cross-validation folds
spatial_folds = blockCV::cv_spatial( spatial_folds = blockCV::cv_spatial(
...@@ -162,10 +167,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -162,10 +167,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
# Define predictor columns # Define predictor columns
predictors = paste0("layer_", 1:19) predictors = paste0("layer_", 1:19)
# ------------------------------- # # Define empty performance summary
# Train models ####
# ------------------------------- #
# Preparation
na_performance = list( na_performance = list(
AUC = NA, AUC = NA,
Accuracy = NA, Accuracy = NA,
...@@ -175,18 +177,42 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -175,18 +177,42 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
F1 = NA F1 = NA
) )
# Feature selection # ------------------------------- #
# --> Very time consuming! # Train models ####
# ------------------------------- #
# feature_selection = rfe( ## cito ####
# x = dplyr::select(model_data, contains("layer")), # model_data_nn = model_data %>%
# y = model_data$presence, # dplyr::mutate(across(all_of(predictors), scale)) %>%
# rfeControl = rfeControl(functions = caretFuncs, # select(-species, -longitude, -latitude)
# method = "cv", #
# number = 3, # Number of folds # train_data_nn = model_data_nn[train_index, ]
# verbose = T) # test_data_nn = model_data_nn[-train_index, ]
#
# nn_fit = dnn(
# Y = train_data_nn$presence,
# X = as.matrix(train_data_nn[, predictors]),
# hidden = c(500L, 500L),
# loss = "binomial",
# activation = "leaky_relu",
# epochs = 2000L,
# lr = 0.02,
# dropout = 0.2, # Regularization
# burnin = Inf,
# optimizer = config_optimizer("adam", weight_decay = 0.001),
# lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 10, factor = 0.7),
# early_stopping = 500L, # stop training when validation loss does not decrease anymore
# validation = 0.2, # used for early stopping and lr_scheduler
# device = "cpu",
# bootstrap = 50L
# ) # )
#
# nn_fit$successfull
# preds = predict(nn_fit, type = "response") # --> Strange discrete steps of size 1/bootstrap_value
# plot(preds)
# Metrics::auc(train_data_nn$presence, round(preds[,1]))
## caret ####
# Define training routine
index_train = lapply(unique(sort(train_data$fold)), function(x){ index_train = lapply(unique(sort(train_data$fold)), function(x){
return(which(train_data$fold != x)) return(which(train_data$fold != x))
}) })
...@@ -207,7 +233,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -207,7 +233,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
rf_fit = caret::train( rf_fit = caret::train(
x = train_data[, predictors], x = train_data[, predictors],
y = train_data$presence, y = train_data$presence_fct,
method = "rf", method = "rf",
metric = "ROC", metric = "ROC",
tuneGrid = rf_grid, tuneGrid = rf_grid,
...@@ -229,7 +255,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -229,7 +255,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
gbm_fit = train( gbm_fit = train(
x = train_data[, predictors], x = train_data[, predictors],
y = train_data$presence, y = train_data$presence_fct,
method = "gbm", method = "gbm",
metric = "ROC", metric = "ROC",
verbose = F, verbose = F,
...@@ -245,7 +271,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -245,7 +271,7 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
glm_performance = tryCatch({ glm_performance = tryCatch({
glm_fit = train( glm_fit = train(
x = train_data[, predictors], x = train_data[, predictors],
y = train_data$presence, y = train_data$presence_fct,
method = "glm", method = "glm",
family=binomial, family=binomial,
metric = "ROC", metric = "ROC",
...@@ -273,4 +299,4 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se ...@@ -273,4 +299,4 @@ model_results = furrr::future_map(occs_split, .options = furrr::furrr_options(se
return(performance_summary) return(performance_summary)
}) })
save(model_results, file = "r_objects/model_results.RData") save(model_results, file = "data/r_objects/model_results.RData")
...@@ -11,17 +11,17 @@ library(sf) ...@@ -11,17 +11,17 @@ library(sf)
library(plotly) library(plotly)
library(DT) library(DT)
load("../r_objects/model_results.RData") load("../data/r_objects/model_results.RData")
load("../r_objects/test_species.RData") load("../data/r_objects/range_maps.RData")
load("../r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData")
load("../r_objects/range_maps_gridded.RData") load("../data/r_objects/occs_final.RData")
load("../r_objects/occs_final.RData") load("../data/r_objects/functional_groups.RData")
sf::sf_use_s2(use_s2 = FALSE) sf::sf_use_s2(use_s2 = FALSE)
``` ```
## Summary ## Summary
This document summarizes the performance of three SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model) for 96 South American terrestrial mammal species. We use six metrics (AUC, F1, kappa, accuracy, precision, and recall) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range occupancy, spatial dispersion, and functional group). This document summarizes the performance of three SDM algorithms (Random Forest, Gradient Boosting Machine, Generalized Linear Model) for `{r} length(model_results)` South American mammal species. We use six metrics (AUC, F1, kappa, accuracy, precision, and recall) to evaluate model performance and look at how performance varies with five factors (number of records, range size, range coverage, range coverage bias, and functional group).
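The six metrics are computed per species and model from held-out predictions, mirroring the `evaluate_model()` helper in `R/utils.R`. A minimal sketch of that computation (assuming `preds` and `actual` are factors with levels "present"/"absent" and `probs` holds predicted presence probabilities; these are placeholder objects used only for illustration):

```{r metrics_sketch, eval=FALSE}
# Sketch: deriving the six reported metrics for one model and one species
conf_mat  <- caret::confusionMatrix(preds, actual, positive = "present")
auc       <- pROC::roc(actual, probs, levels = c("present", "absent"), direction = ">")$auc
accuracy  <- conf_mat$overall["Accuracy"]
kappa     <- conf_mat$overall["Kappa"]
precision <- conf_mat$byClass["Precision"]
recall    <- conf_mat$byClass["Recall"]
f1        <- conf_mat$byClass["F1"]
```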
Modeling decisions: Modeling decisions:
...@@ -34,12 +34,13 @@ Modeling decisions: ...@@ -34,12 +34,13 @@ Modeling decisions:
Key findings: Key findings:
- RF and GBM models generally performed better than GLM across metrics - RF performed best, GBM slightly worse, GLM worst
- More occurrence records and larger range sizes tended to improve model accuracy - More occurrence records and larger range sizes tended to improve model performance
- Higher range occupancy correlated with better performance. - Higher range coverage correlated with better performance.
- Spatial dispersion and functional group showed some impact but were less consistent - Range coverage bias and functional group showed some impact but were less consistent <!-- TODO: check after rerun -->
## Analysis ## Analysis
The table below shows the analysed modeling results. The table below shows the analysed modeling results.
```{r performance, echo = FALSE, message=FALSE, warnings=FALSE} ```{r performance, echo = FALSE, message=FALSE, warnings=FALSE}
...@@ -50,7 +51,7 @@ performance = model_results %>% ...@@ -50,7 +51,7 @@ performance = model_results %>%
dplyr::filter(!is.na(value)) %>% dplyr::filter(!is.na(value)) %>%
dplyr::mutate( dplyr::mutate(
metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")), metric = factor(metric, levels = c("auc", "kappa", "f1", "accuracy", "precision", "recall")),
value = pmax(value, 0, na.rm = T) # Fix one weird instance of f1 < 0 value = round(pmax(value, 0, na.rm = T), 3) # Fix one weird instance of f1 < 0
) )
DT::datatable(performance) DT::datatable(performance)
...@@ -59,7 +60,7 @@ DT::datatable(performance) ...@@ -59,7 +60,7 @@ DT::datatable(performance)
### Number of records ### Number of records
- Model performance was generally better for species with more observations - Model performance was generally better for species with more observations
- No major improvements in model performance beyond \~500 observations - Very poor performance below 50-100 observations
```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE} ```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE}
df_plot = performance df_plot = performance
...@@ -89,7 +90,7 @@ plot <- plot_ly( ...@@ -89,7 +90,7 @@ plot <- plot_ly(
plot <- plot %>% plot <- plot %>%
layout( layout(
title = "Model Performance vs. Number of observations", title = "Model Performance vs. Number of observations",
xaxis = list(title = "Number of observations"), xaxis = list(title = "Number of observations", type = "log"),
yaxis = list(title = "Value"), yaxis = list(title = "Value"),
legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
margin = list(r = 150), # Add right margin to accommodate legend margin = list(r = 150), # Add right margin to accommodate legend
...@@ -115,12 +116,12 @@ bslib::card(plot, full_screen = T) ...@@ -115,12 +116,12 @@ bslib::card(plot, full_screen = T)
### Range characteristics ### Range characteristics
The spatial coverage of species occurrences was evaluated using a systematic hexagonal grid system, with each cell spanning 1 degree in both latitudinal and longitudinal dimensions. Two complementary metrics were used to assess range coverage: First, sampling completeness was quantified by determining the number of grid cells containing at least one occurrence record (*occupancy*). Second, we calculated the theoretical maximum number of occupied hexagons possible given the total number of observations (*spatial dispersion*).
#### Range size #### Range size
- Model performance was lower for small-ranged species Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).
- No major improvements in model performance beyond range size of \~3M km²
- Model performance tended to be slightly higher for species with larger range size
- Only RF shows continuous performance improvements beyond range sizes of \~5M km²
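A minimal sketch of that range-size calculation (assuming `range_maps` is the sf polygon layer loaded above; `sf::st_area()` returns areas in m², converted to km² here):

```{r range_size_sketch, eval=FALSE}
# Sketch: range size (km²) per species from the merged IUCN range polygons
range_sizes = range_maps %>%
  dplyr::mutate(range_size = as.numeric(sf::st_area(geometry)) / 1e6) %>%  # m² -> km²
  sf::st_drop_geometry() %>%
  dplyr::select(name_matched, range_size)
```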
```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} ```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE}
df_join = range_maps %>% df_join = range_maps %>%
...@@ -179,11 +180,17 @@ plot <- plot %>% ...@@ -179,11 +180,17 @@ plot <- plot %>%
bslib::card(plot, full_screen = T) bslib::card(plot, full_screen = T)
``` ```
#### Occupancy #### Range coverage
Species ranges were split into contiguous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the total number of grid cells.
$$
RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
$$
- Models for species with higher range occupancy showed slightly better performance - Models for species with higher range coverage showed slightly better performance
```{r range_occupancy, echo = FALSE, message=FALSE, warnings=FALSE} ```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE}
df_cells_total = range_maps_gridded %>% df_cells_total = range_maps_gridded %>%
dplyr::rename("species" = name_matched) %>% dplyr::rename("species" = name_matched) %>%
group_by(species) %>% group_by(species) %>%
...@@ -199,8 +206,8 @@ df_cells_occ <- range_maps_gridded %>% ...@@ -199,8 +206,8 @@ df_cells_occ <- range_maps_gridded %>%
df_join = df_cells_total %>% df_join = df_cells_total %>%
dplyr::inner_join(df_cells_occ, by = "species") %>% dplyr::inner_join(df_cells_occ, by = "species") %>%
dplyr::mutate(occupancy = cells_occupied / cells_total) %>% dplyr::mutate(range_cov = cells_occupied / cells_total) %>%
dplyr::select(species, occupancy) dplyr::select(species, range_cov)
df_plot = performance %>% df_plot = performance %>%
inner_join(df_join, by = "species") inner_join(df_join, by = "species")
...@@ -208,14 +215,14 @@ df_plot = performance %>% ...@@ -208,14 +215,14 @@ df_plot = performance %>%
# Create base plot # Create base plot
plot <- plot_ly( plot <- plot_ly(
data = df_plot, data = df_plot,
x = ~occupancy, x = ~range_cov,
y = ~value, y = ~value,
color = ~model, color = ~model,
type = 'scatter', type = 'scatter',
mode = 'markers', mode = 'markers',
name = ~model, name = ~model,
hoverinfo = 'text', hoverinfo = 'text',
text = ~paste("Species:", species, "<br>Range occupancy:", round(occupancy, 3), "<br>Value:", round(value, 3)), text = ~paste("Species:", species, "<br>Range coverage:", round(range_cov, 3), "<br>Value:", round(value, 3)),
transforms = list( transforms = list(
list( list(
type = 'filter', type = 'filter',
...@@ -229,8 +236,8 @@ plot <- plot_ly( ...@@ -229,8 +236,8 @@ plot <- plot_ly(
# Add dropdown for selecting metric # Add dropdown for selecting metric
plot <- plot %>% plot <- plot %>%
layout( layout(
title = "Model Performance vs. Range occupancy", title = "Model Performance vs. Range coverage",
xaxis = list(title = "Range occupancy"), xaxis = list(title = "Range coverage"),
yaxis = list(title = "Value"), yaxis = list(title = "Value"),
legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
margin = list(r = 150), # Add right margin to accommodate legend margin = list(r = 150), # Add right margin to accommodate legend
...@@ -254,13 +261,19 @@ plot <- plot %>% ...@@ -254,13 +261,19 @@ plot <- plot %>%
bslib::card(plot, full_screen = T) bslib::card(plot, full_screen = T)
``` ```
#### Spatial dispersion #### Range coverage bias
- Models for species with higher spatial dispersion tended to exhibit lower performance Range coverage bias was calculated as the minimum of total grid cells and total occurrences divided by the number of occupied cells.
- Large spread, weak relationship $$
RangeCoverageBias = \frac{min(N_{cells\_total}, N_{obs\_total})}{N_{cells\_occupied}}
$$
```{r spatial_dispersion, echo = FALSE, message=FALSE, warnings=FALSE} Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
- There was no strong relationship between range coverage bias and model performance
```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE}
df_occs_total = occs_final %>% df_occs_total = occs_final %>%
st_drop_geometry() %>% st_drop_geometry() %>%
group_by(species) %>% group_by(species) %>%
...@@ -269,7 +282,7 @@ df_occs_total = occs_final %>% ...@@ -269,7 +282,7 @@ df_occs_total = occs_final %>%
df_join = df_occs_total %>% df_join = df_occs_total %>%
dplyr::inner_join(df_cells_total, by = "species") %>% dplyr::inner_join(df_cells_total, by = "species") %>%
dplyr::inner_join(df_cells_occ, by = "species") %>% dplyr::inner_join(df_cells_occ, by = "species") %>%
dplyr::mutate(dispersion = cells_occupied / pmin(cells_total, occs_total)) dplyr::mutate(range_bias = pmin(cells_total, occs_total) / cells_occupied)
df_plot = performance %>% df_plot = performance %>%
inner_join(df_join, by = "species") inner_join(df_join, by = "species")
...@@ -277,14 +290,14 @@ df_plot = performance %>% ...@@ -277,14 +290,14 @@ df_plot = performance %>%
# Create base plot # Create base plot
plot <- plot_ly( plot <- plot_ly(
data = df_plot, data = df_plot,
x = ~dispersion, x = ~range_bias,
y = ~value, y = ~value,
color = ~model, color = ~model,
type = 'scatter', type = 'scatter',
mode = 'markers', mode = 'markers',
name = ~model, name = ~model,
hoverinfo = 'text', hoverinfo = 'text',
text = ~paste("Species:", species, "<br>Dispersion:", round(dispersion, 3), "<br>Value:", round(value, 3)), text = ~paste("Species:", species, "<br>Range coverage bias:", round(range_bias, 3), "<br>Value:", round(value, 3)),
transforms = list( transforms = list(
list( list(
type = 'filter', type = 'filter',
...@@ -298,8 +311,8 @@ plot <- plot_ly( ...@@ -298,8 +311,8 @@ plot <- plot_ly(
# Add dropdown for selecting metric # Add dropdown for selecting metric
plot <- plot %>% plot <- plot %>%
layout( layout(
title = "Model Performance vs. Dispersion", title = "Model Performance vs. Range coverage bias",
xaxis = list(title = "Dispersion"), xaxis = list(title = "Range coverage bias", type = "log"),
yaxis = list(title = "Value"), yaxis = list(title = "Value"),
legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
margin = list(r = 150), # Add right margin to accommodate legend margin = list(r = 150), # Add right margin to accommodate legend
...@@ -325,15 +338,20 @@ bslib::card(plot, full_screen = T) ...@@ -325,15 +338,20 @@ bslib::card(plot, full_screen = T)
### Functional group ### Functional group
- No major difference in model performance among functional groups Functional groups were assigned based on taxonomic order. The following groupings were used:
```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE} | Functional group | Taxonomic orders |
df_join = test_species %>% |-----------------------|-----------------------------------------------------------------------|
dplyr::distinct(name_matched, functional_group) %>% | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla |
dplyr::mutate(functional_group = factor(functional_group, labels = c("arboreal", "large ground-dwelling", "small ground-dwelling"))) | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
| arboreal | Primates, Pilosa |
| flying | Chiroptera |
- Models for bats tended to perform slightly worse than for other groups.
```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE}
df_plot = performance %>% df_plot = performance %>%
dplyr::left_join(df_join, by = c("species" = "name_matched")) dplyr::left_join(functional_groups, by = c("species" = "name_matched"))
plot <- plot_ly( plot <- plot_ly(
data = df_plot, data = df_plot,
...@@ -359,7 +377,7 @@ plot <- plot_ly( ...@@ -359,7 +377,7 @@ plot <- plot_ly(
plot <- plot %>% plot <- plot %>%
layout( layout(
title = "Model Performance vs. Functional Group", title = "Model Performance vs. Functional Group",
xaxis = list(title = "Range occupancy"), xaxis = list(title = "Functional group"),
yaxis = list(title = "Value"), yaxis = list(title = "Value"),
legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
margin = list(r = 150), # Add right margin to accommodate legend margin = list(r = 150), # Add right margin to accommodate legend
......
...@@ -3,8 +3,6 @@ library(Symobio) ...@@ -3,8 +3,6 @@ library(Symobio)
library(sf) library(sf)
library(rnaturalearth) library(rnaturalearth)
load("r_objects/test_species.RData")
sf::sf_use_s2(use_s2 = FALSE) sf::sf_use_s2(use_s2 = FALSE)
# Load range maps # Load range maps
...@@ -15,13 +13,16 @@ range_maps = st_read( ...@@ -15,13 +13,16 @@ range_maps = st_read(
) %>% ) %>%
dplyr::filter(!legend %in% c("Extinct", "Not Mapped")) dplyr::filter(!legend %in% c("Extinct", "Not Mapped"))
# Load South America polygon # Find species that intersect with Amazon
sa_polygon = rnaturalearth::ne_countries() %>% amazon = st_read(
dplyr::filter(continent == "South America") %>% "data/geospatial/amazonia_polygons/amazonia_polygons.shp",
sf::st_union() geometry_column = "geometry",
promote_to_multi = T
)
range_maps = st_filter(range_maps, amazon)
# Match names against GBIF backbone # Match names against GBIF backbone
maps_names_matched = lapply(unique(range_maps$binomial), function(name){ range_maps_names_matched = lapply(unique(range_maps$binomial), function(name){
tryCatch({ tryCatch({
match_result = Symobio::gbif_match_name(name = name) match_result = Symobio::gbif_match_name(name = name)
if(match_result$status != "ACCEPTED"){ if(match_result$status != "ACCEPTED"){
...@@ -34,25 +35,25 @@ maps_names_matched = lapply(unique(range_maps$binomial), function(name){ ...@@ -34,25 +35,25 @@ maps_names_matched = lapply(unique(range_maps$binomial), function(name){
return(NULL) return(NULL)
}) })
}) })
save(range_maps_names_matched, file = "data/r_objects/range_maps_names_matched.RData")
save(range_maps_names_matched, file = "r_objects/range_maps_names_matched.RData")
# Subset range maps to target species and focal region # Subset range maps to target species and focal region
names_subset = Filter(is.data.frame, range_maps_names_matched) %>% names_matched = Filter(is.data.frame, range_maps_names_matched) %>%
bind_rows() %>% bind_rows()
dplyr::filter(name_matched %in% test_species$name_matched)
range_maps = range_maps %>% range_maps = range_maps %>%
inner_join(names_subset, by = c("binomial" = "name_orig")) %>% inner_join(names_matched, by = c("binomial" = "name_orig")) %>%
group_by(name_matched) %>% group_by(name_matched) %>%
summarize(geometry = suppressMessages(st_union(geometry))) %>% summarize(geometry = suppressMessages(st_union(geometry)))
st_intersection(sa_polygon)
save(range_maps, file = "r_objects/range_maps.RData") save(range_maps, file = "data/r_objects/range_maps.RData")
# Gridded range maps # Gridded range maps
range_maps_gridded = st_make_grid(sa_polygon, square = FALSE, cellsize = 1) %>% range_maps_gridded = rnaturalearth::ne_countries() %>%
dplyr::filter(continent == "South America") %>%
sf::st_union() %>%
st_make_grid(square = FALSE, cellsize = 1) %>%
st_sf() %>% st_sf() %>%
st_join(range_maps, st_intersects, left = F) st_join(range_maps, st_intersects, left = F)
save(range_maps_gridded, file = "r_objects/range_maps_gridded.RData") save(range_maps_gridded, file = "data/r_objects/range_maps_gridded.RData")
...@@ -28,7 +28,7 @@ evaluate_model <- function(model, test_data, threshold = 0.5) { ...@@ -28,7 +28,7 @@ evaluate_model <- function(model, test_data, threshold = 0.5) {
# Predict probabilities # Predict probabilities
probs <- predict(model, test_data, type = "prob")$present probs <- predict(model, test_data, type = "prob")$present
preds <- predict(model, test_data, type = "raw") preds <- predict(model, test_data, type = "raw")
actual <- test_data$presence actual <- test_data$presence_fct
# Calculate AUC # Calculate AUC
auc <- pROC::roc(actual, probs, levels = c("present", "absent"), direction = ">")$auc auc <- pROC::roc(actual, probs, levels = c("present", "absent"), direction = ">")$auc
......
{ {
"R": { "R": {
"Version": "4.2.2", "Version": "4.3.0",
"Repositories": [ "Repositories": [
{ {
"Name": "CRAN", "Name": "CRAN",
...@@ -55,7 +55,7 @@ ...@@ -55,7 +55,7 @@
}, },
"MASS": { "MASS": {
"Package": "MASS", "Package": "MASS",
"Version": "7.3-58", "Version": "7.3-58.4",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -66,11 +66,11 @@ ...@@ -66,11 +66,11 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "4bf2c61e656bbc238634c798c208d386" "Hash": "a3142b2a022b8174ca675bc8b80cdc4e"
}, },
"Matrix": { "Matrix": {
"Package": "Matrix", "Package": "Matrix",
"Version": "1.5-1", "Version": "1.5-4",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -82,7 +82,7 @@ ...@@ -82,7 +82,7 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "539dc0c0c05636812f1080f473d2c177" "Hash": "e779c7d9f35cc364438578f334cffee2"
}, },
"ModelMetrics": { "ModelMetrics": {
"Package": "ModelMetrics", "Package": "ModelMetrics",
...@@ -242,7 +242,7 @@ ...@@ -242,7 +242,7 @@
}, },
"boot": { "boot": {
"Package": "boot", "Package": "boot",
"Version": "1.3-28", "Version": "1.3-28.1",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -250,7 +250,7 @@ ...@@ -250,7 +250,7 @@
"graphics", "graphics",
"stats" "stats"
], ],
"Hash": "0baa960e3b49c6176a4f42addcbacc59" "Hash": "9a052fbcbe97a98ceb18dbfd30ebd96e"
}, },
"broom": { "broom": {
"Package": "broom", "Package": "broom",
...@@ -391,7 +391,7 @@ ...@@ -391,7 +391,7 @@
}, },
"class": { "class": {
"Package": "class", "Package": "class",
"Version": "7.3-20", "Version": "7.3-21",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -400,7 +400,7 @@ ...@@ -400,7 +400,7 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "da09d82223e669d270e47ed24ac8686e" "Hash": "8ae0d4328e2eb3a582dfd5391a3663b7"
}, },
"classInt": { "classInt": {
"Package": "classInt", "Package": "classInt",
...@@ -457,13 +457,13 @@ ...@@ -457,13 +457,13 @@
}, },
"codetools": { "codetools": {
"Package": "codetools", "Package": "codetools",
"Version": "0.2-18", "Version": "0.2-19",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
"R" "R"
], ],
"Hash": "019388fc48e48b3da0d3a76ff94608a8" "Hash": "c089a619a7fae175d149d89164f8c7d8"
}, },
"colorspace": { "colorspace": {
"Package": "colorspace", "Package": "colorspace",
...@@ -1226,7 +1226,7 @@ ...@@ -1226,7 +1226,7 @@
}, },
"lattice": { "lattice": {
"Package": "lattice", "Package": "lattice",
"Version": "0.20-45", "Version": "0.21-8",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -1237,7 +1237,7 @@ ...@@ -1237,7 +1237,7 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "b64cdbb2b340437c4ee047a1f4c4377b" "Hash": "0b8a6d63c8770f02a8b5635f3c431e6b"
}, },
"lava": { "lava": {
"Package": "lava", "Package": "lava",
...@@ -1355,7 +1355,7 @@ ...@@ -1355,7 +1355,7 @@
}, },
"mgcv": { "mgcv": {
"Package": "mgcv", "Package": "mgcv",
"Version": "1.8-41", "Version": "1.8-42",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -1368,7 +1368,7 @@ ...@@ -1368,7 +1368,7 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "6b3904f13346742caa3e82dd0303d4ad" "Hash": "3460beba7ccc8946249ba35327ba902a"
}, },
"mime": { "mime": {
"Package": "mime", "Package": "mime",
...@@ -1421,7 +1421,7 @@ ...@@ -1421,7 +1421,7 @@
}, },
"nlme": { "nlme": {
"Package": "nlme", "Package": "nlme",
"Version": "3.1-160", "Version": "3.1-162",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -1431,7 +1431,7 @@ ...@@ -1431,7 +1431,7 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "02e3c6e7df163aafa8477225e6827bc5" "Hash": "0984ce8da8da9ead8643c5cbbb60f83e"
}, },
"nloptr": { "nloptr": {
"Package": "nloptr", "Package": "nloptr",
...@@ -1790,13 +1790,13 @@ ...@@ -1790,13 +1790,13 @@
}, },
"renv": { "renv": {
"Package": "renv", "Package": "renv",
"Version": "1.0.5", "Version": "1.0.11",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
"utils" "utils"
], ],
"Hash": "32c3f93e8360f667ca5863272ec8ba6a" "Hash": "47623f66b4e80b3b0587bc5d7b309888"
}, },
"reprex": { "reprex": {
"Package": "reprex", "Package": "reprex",
...@@ -2100,7 +2100,7 @@ ...@@ -2100,7 +2100,7 @@
}, },
"survival": { "survival": {
"Package": "survival", "Package": "survival",
"Version": "3.4-0", "Version": "3.5-5",
"Source": "Repository", "Source": "Repository",
"Repository": "CRAN", "Repository": "CRAN",
"Requirements": [ "Requirements": [
...@@ -2112,7 +2112,7 @@ ...@@ -2112,7 +2112,7 @@
"stats", "stats",
"utils" "utils"
], ],
"Hash": "04411ae66ab4659230c067c32966fc20" "Hash": "d683341b1fa2e8d817efde27d6e6d35b"
}, },
"sys": { "sys": {
"Package": "sys", "Package": "sys",
......
...@@ -2,10 +2,12 @@ ...@@ -2,10 +2,12 @@
local({ local({
# the requested version of renv # the requested version of renv
version <- "1.0.5" version <- "1.0.11"
attr(version, "sha") <- NULL attr(version, "sha") <- NULL
# the project directory # the project directory
project <- Sys.getenv("RENV_PROJECT")
if (!nzchar(project))
project <- getwd() project <- getwd()
# use start-up diagnostics if enabled # use start-up diagnostics if enabled
...@@ -96,6 +98,66 @@ local({ ...@@ -96,6 +98,66 @@ local({
unloadNamespace("renv") unloadNamespace("renv")
# load bootstrap tools # load bootstrap tools
ansify <- function(text) {
if (renv_ansify_enabled())
renv_ansify_enhanced(text)
else
renv_ansify_default(text)
}
renv_ansify_enabled <- function() {
override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA)
if (!is.na(override))
return(as.logical(override))
pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA)
if (identical(pane, "build"))
return(FALSE)
testthat <- Sys.getenv("TESTTHAT", unset = "false")
if (tolower(testthat) %in% "true")
return(FALSE)
iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false")
if (tolower(iderun) %in% "false")
return(FALSE)
TRUE
}
renv_ansify_default <- function(text) {
text
}
renv_ansify_enhanced <- function(text) {
# R help links
pattern <- "`\\?(renv::(?:[^`])+)`"
replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`"
text <- gsub(pattern, replacement, text, perl = TRUE)
# runnable code
pattern <- "`(renv::(?:[^`])+)`"
replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`"
text <- gsub(pattern, replacement, text, perl = TRUE)
# return ansified text
text
}
renv_ansify_init <- function() {
envir <- renv_envir_self()
if (renv_ansify_enabled())
assign("ansify", renv_ansify_enhanced, envir = envir)
else
assign("ansify", renv_ansify_default, envir = envir)
}
`%||%` <- function(x, y) { `%||%` <- function(x, y) {
if (is.null(x)) y else x if (is.null(x)) y else x
} }
...@@ -129,6 +191,24 @@ local({ ...@@ -129,6 +191,24 @@ local({
} }
heredoc <- function(text, leave = 0) {
# remove leading, trailing whitespace
trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text)
# split into lines
lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]]
# compute common indent
indent <- regexpr("[^[:space:]]", lines)
common <- min(setdiff(indent, -1L)) - leave
text <- paste(substring(lines, common), collapse = "\n")
# substitute in ANSI links for executable renv code
ansify(text)
}
startswith <- function(string, prefix) { startswith <- function(string, prefix) {
substring(string, 1, nchar(prefix)) == prefix substring(string, 1, nchar(prefix)) == prefix
} }
...@@ -288,8 +368,11 @@ local({ ...@@ -288,8 +368,11 @@ local({
quiet = TRUE quiet = TRUE
) )
if ("headers" %in% names(formals(utils::download.file))) if ("headers" %in% names(formals(utils::download.file))) {
args$headers <- renv_bootstrap_download_custom_headers(url) headers <- renv_bootstrap_download_custom_headers(url)
if (length(headers) && is.character(headers))
args$headers <- headers
}
do.call(utils::download.file, args) do.call(utils::download.file, args)
...@@ -368,10 +451,21 @@ local({ ...@@ -368,10 +451,21 @@ local({
for (type in types) { for (type in types) {
for (repos in renv_bootstrap_repos()) { for (repos in renv_bootstrap_repos()) {
# build arguments for utils::available.packages() call
args <- list(type = type, repos = repos)
# add custom headers if available -- note that
# utils::available.packages() will pass this to download.file()
if ("headers" %in% names(formals(utils::download.file))) {
headers <- renv_bootstrap_download_custom_headers(repos)
if (length(headers) && is.character(headers))
args$headers <- headers
}
# retrieve package database # retrieve package database
db <- tryCatch( db <- tryCatch(
as.data.frame( as.data.frame(
utils::available.packages(type = type, repos = repos), do.call(utils::available.packages, args),
stringsAsFactors = FALSE stringsAsFactors = FALSE
), ),
error = identity error = identity
...@@ -453,6 +547,14 @@ local({ ...@@ -453,6 +547,14 @@ local({
} }
renv_bootstrap_github_token <- function() {
for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) {
envval <- Sys.getenv(envvar, unset = NA)
if (!is.na(envval))
return(envval)
}
}
renv_bootstrap_download_github <- function(version) { renv_bootstrap_download_github <- function(version) {
enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE")
...@@ -460,16 +562,16 @@ local({ ...@@ -460,16 +562,16 @@ local({
return(FALSE) return(FALSE)
# prepare download options # prepare download options
pat <- Sys.getenv("GITHUB_PAT") token <- renv_bootstrap_github_token()
if (nzchar(Sys.which("curl")) && nzchar(pat)) { if (nzchar(Sys.which("curl")) && nzchar(token)) {
fmt <- "--location --fail --header \"Authorization: token %s\"" fmt <- "--location --fail --header \"Authorization: token %s\""
extra <- sprintf(fmt, pat) extra <- sprintf(fmt, token)
saved <- options("download.file.method", "download.file.extra") saved <- options("download.file.method", "download.file.extra")
options(download.file.method = "curl", download.file.extra = extra) options(download.file.method = "curl", download.file.extra = extra)
on.exit(do.call(base::options, saved), add = TRUE) on.exit(do.call(base::options, saved), add = TRUE)
} else if (nzchar(Sys.which("wget")) && nzchar(pat)) { } else if (nzchar(Sys.which("wget")) && nzchar(token)) {
fmt <- "--header=\"Authorization: token %s\"" fmt <- "--header=\"Authorization: token %s\""
extra <- sprintf(fmt, pat) extra <- sprintf(fmt, token)
saved <- options("download.file.method", "download.file.extra") saved <- options("download.file.method", "download.file.extra")
options(download.file.method = "wget", download.file.extra = extra) options(download.file.method = "wget", download.file.extra = extra)
on.exit(do.call(base::options, saved), add = TRUE) on.exit(do.call(base::options, saved), add = TRUE)
...@@ -631,6 +733,9 @@ local({ ...@@ -631,6 +733,9 @@ local({
# if the user has requested an automatic prefix, generate it # if the user has requested an automatic prefix, generate it
auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA)
if (is.na(auto) && getRversion() >= "4.4.0")
auto <- "TRUE"
if (auto %in% c("TRUE", "True", "true", "1")) if (auto %in% c("TRUE", "True", "true", "1"))
return(renv_bootstrap_platform_prefix_auto()) return(renv_bootstrap_platform_prefix_auto())
...@@ -822,24 +927,23 @@ local({ ...@@ -822,24 +927,23 @@ local({
# the loaded version of renv doesn't match the requested version; # the loaded version of renv doesn't match the requested version;
# give the user instructions on how to proceed # give the user instructions on how to proceed
remote <- if (!is.null(description[["RemoteSha"]])) { dev <- identical(description[["RemoteType"]], "github")
remote <- if (dev)
paste("rstudio/renv", description[["RemoteSha"]], sep = "@") paste("rstudio/renv", description[["RemoteSha"]], sep = "@")
} else { else
paste("renv", description[["Version"]], sep = "@") paste("renv", description[["Version"]], sep = "@")
}
# display both loaded version + sha if available # display both loaded version + sha if available
friendly <- renv_bootstrap_version_friendly( friendly <- renv_bootstrap_version_friendly(
version = description[["Version"]], version = description[["Version"]],
sha = description[["RemoteSha"]] sha = if (dev) description[["RemoteSha"]]
) )
fmt <- paste( fmt <- heredoc("
"renv %1$s was loaded from project library, but this project is configured to use renv %2$s.", renv %1$s was loaded from project library, but this project is configured to use renv %2$s.
"- Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.", - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile.
"- Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.", - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library.
sep = "\n" ")
)
catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote)
FALSE FALSE
......