# 03_01_modelling_ssdm.R
library(dplyr)
library(tidyr)
library(sf)
library(caret)
library(cito)
library(pROC)
library(parallel)
library(doParallel)
library(foreach)
source("R/utils.R")
load("data/r_objects/model_data.RData")
species_processed = list.files("data/r_objects/ssdm_results/performance/", pattern = "\\.RData$") %>%
stringr::str_remove("\\.RData$")
data_split = model_data %>%
sf::st_drop_geometry() %>%
dplyr::filter(!species %in% species_processed) %>%
dplyr::group_by(species) %>%
dplyr::group_split()
# ----------------------------------------------------------------------#
# Set up parallel processing ####
# ----------------------------------------------------------------------#
num_cores <- 2 # Some packages use internal parallelization, so be careful with increasing this param
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# ----------------------------------------------------------------------#
# Run training ####
# ----------------------------------------------------------------------#
# Use foreach for parallel processing across species
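# Each iteration saves its results to disk; the foreach return value is not used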
foreach(pa_spec = data_split, .packages = c("dplyr", "caret", "cito", "pROC", "tidyr")) %dopar% {
na_performance = list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_,
precision = NA_real_, recall = NA_real_, f1 = NA_real_,
tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_)
predictors = paste0("bio", 1:19)
species = pa_spec$species[1]
print(species)
pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))
folds = sort(unique(na.omit(pa_spec$fold_global)))
# Outer CV loop (for averaging performance metrics)
performance_cv = lapply(folds, function(fold){
print(paste("Fold", fold))
## Preparations #####
data_train = dplyr::filter(pa_spec, record_type == "background" | fold_global != fold) # include background samples
data_test = dplyr::filter(pa_spec, fold_global == fold)
if(all(data_train$present == 0)){ # If only absences in training dataset --> return
return(NULL)
}
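# Inner resampling for hyperparameter tuning: caret's default bootstrap (5 resamples)
# with random search over the candidate settings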
train_ctrl = caret::trainControl(
search = "random",
classProbs = TRUE,
number = 5,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
## Random Forest #####
rf_performance = tryCatch({
# Fit model
rf_fit = caret::train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "ranger",
trControl = train_ctrl,
tuneLength = 8,
weights = data_train$weight,
verbose = F
)
evaluate_model(rf_fit, data_test)
}, error = function(e){
na_performance
})
## Gradient Boosted Machine ####
gbm_performance = tryCatch({
gbm_fit = train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "gbm",
trControl = train_ctrl,
tuneLength = 8,
weights = data_train$weight,
verbose = F
)
evaluate_model(gbm_fit, data_test)
}, error = function(e){
na_performance
})
## Generalized Additive Model ####
gam_performance = tryCatch({
gam_fit = train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "gamSpline",
tuneLength = 8,
weights = data_train$weight,
trControl = train_ctrl
)
evaluate_model(gam_fit, data_test)
}, error = function(e){
na_performance
})
## Neural Network ####
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
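# Note: device = "cuda" below assumes a CUDA-capable GPU is available to each worker;
# set device = "cpu" in dnn() otherwise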
nn_performance = tryCatch({
nn_fit = dnn(
formula,
data = data_train,
hidden = c(200L, 200L, 200L),
loss = "binomial",
epochs = 200L,
burnin = 200L,
early_stopping = 25,
lr = 0.001,
batchsize = min(ceiling(nrow(data_train)/10), 64),
dropout = 0.25,
optimizer = config_optimizer("adam"),
validation = 0.2,
device = "cuda",
verbose = F,
plot = F
)
evaluate_model(nn_fit, data_test)
}, error = function(e){
na_performance
})
# Summarize results
performance_summary = tibble(
species = !!species,
obs = nrow(data_train),
fold_global = fold,
model = c("rf", "gbm", "gam", "nn"),
auc = c(rf_performance$auc, gbm_performance$auc, gam_performance$auc, nn_performance$auc),
accuracy = c(rf_performance$accuracy, gbm_performance$accuracy, gam_performance$accuracy, nn_performance$accuracy),
kappa = c(rf_performance$kappa, gbm_performance$kappa, gam_performance$kappa, nn_performance$kappa),
precision = c(rf_performance$precision, gbm_performance$precision, gam_performance$precision, nn_performance$precision),
recall = c(rf_performance$recall, gbm_performance$recall, gam_performance$recall, nn_performance$recall),
f1 = c(rf_performance$f1, gbm_performance$f1, gam_performance$f1, nn_performance$f1),
tp = c(rf_performance$tp, gbm_performance$tp, gam_performance$tp, nn_performance$tp),
fp = c(rf_performance$fp, gbm_performance$fp, gam_performance$fp, nn_performance$fp),
tn = c(rf_performance$tn, gbm_performance$tn, gam_performance$tn, nn_performance$tn),
fn = c(rf_performance$fn, gbm_performance$fn, gam_performance$fn, nn_performance$fn)
) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value")
return(performance_summary)
})
# Combine and save evaluation results
performance_spec = bind_rows(performance_cv)
if(nrow(performance_spec) != 0){
performance_spec = performance_spec %>%
dplyr::arrange(fold_global, model, metric)
save(performance_spec, file = paste0("data/r_objects/ssdm_results/performance/", species, ".RData"))
}
}
# Stop the parallel cluster when done
stopCluster(cl)
# Combine results
files = list.files("data/r_objects/ssdm_results/performance/", full.names = T, pattern = "\\.RData$")
ssdm_performance = lapply(files, function(f){load(f); return(performance_spec)}) %>%
bind_rows()
save(ssdm_performance, file = "data/r_objects/ssdm_performance.RData")
# ----------------------------------------------------------------------#
# Train full models ####
# ----------------------------------------------------------------------#
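# Refit each algorithm on the complete dataset of each species; hyperparameters are tuned
# with spatial CV folds from blockCV, passed to caret via trainControl(index = ...)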
data_split = model_data %>%
dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>%
dplyr::group_by(species) %>%
dplyr::group_split()
for(pa_spec in data_split){
## Preparations ####
species = pa_spec$species[1]
print(as.character(species))
# Create inner CV folds for model training
cv_train = blockCV::cv_spatial(
pa_spec,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
pa_spec$fold_train = cv_train$folds_ids
# Drop geometry for model fitting
pa_spec = sf::st_drop_geometry(pa_spec)
# Define caret training routine
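# index: for each resample, the row indices used for fitting (all records outside the
# respective spatial fold); the held-out fold is used to evaluate candidate hyperparameters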
index_train = lapply(unique(sort(pa_spec$fold_train)), function(x){
return(which(pa_spec$fold_train != x))
})
train_ctrl = caret::trainControl(
search = "random",
classProbs = TRUE,
index = index_train,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
# Define predictors
predictors = paste0("bio", 1:19)
# Random Forest ####
try({
rf_fit = caret::train(
x = pa_spec[, predictors],
y = pa_spec$present_fct,
method = "ranger",
trControl = train_ctrl,
tuneLength = 8,
weights = pa_spec$weight,
verbose = F
)
save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData"))
})
# Gradient Boosted Machine ####
try({
gbm_fit = train(
x = pa_spec[, predictors],
y = pa_spec$present_fct,
method = "gbm",
trControl = train_ctrl,
tuneLength = 8,
weights = pa_spec$weight,
verbose = F
)
save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData"))
})
# Generalized Additive Model ####
try({
gam_fit = train(
x = pa_spec[, predictors],
y = pa_spec$present_fct,
method = "gamSpline",
tuneLength = 8,
weights = pa_spec$weight,
trControl = train_ctrl
)
save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData"))
})
# Neural Network ####
try({
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
nn_fit = dnn(
formula,
data = pa_spec,
hidden = c(200L, 200L, 200L),
loss = "binomial",
epochs = 200L,
burnin = 200L,
early_stopping = 25,
lr = 0.001,
batchsize = min(ceiling(nrow(pa_spec)/10), 64),
dropout = 0.25,
optimizer = config_optimizer("adam"),
validation = 0.2,
device = "cuda",
verbose = F,
plot = F
)
save(nn_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_nn_fit.RData"))
})
}