# 04_01_modelling_ssdm.R
library(furrr)
library(dplyr)
library(tidyr)
library(sf)
library(caret)
library(cito)
library(pROC)
source("R/utils.R")
load("data/r_objects/model_data.RData")
# ----------------------------------------------------------------------#
# Run training ####
# ----------------------------------------------------------------------#
species_processed = list.files("data/r_objects/ssdm_results/", pattern = "\\.RData$") %>%
stringr::str_remove("\\.RData$")
data_split = model_data %>%
dplyr::filter(!species %in% species_processed) %>%
dplyr::group_by(species) %>%
dplyr::group_split()
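# Skip species that already have a results file, so the script can be resumed
# after an interruption.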
for(pa_spec in data_split){
species = pa_spec$species[1]
print(species)
if(all(is.na(pa_spec$fold_eval))){
print("Too few samples, skipping species")
next
}
# Define empty result for performance eval
na_performance = list(
AUC = NA,
Accuracy = NA,
Kappa = NA,
Precision = NA,
Recall = NA,
F1 = NA
)
# Create factor presence column
pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))
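# caret with classProbs = TRUE requires class levels that are valid R variable
# names, so 0/1 presence is recoded to "A" (absent) / "P" (present).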
# Outer CV loop (for averaging performance metrics)
performance_cv = lapply(sort(unique(pa_spec$fold_eval)), function(k){
print(paste("Fold", k))
## Preparations #####
data_test = dplyr::filter(pa_spec, fold_eval == k)
data_train = dplyr::filter(pa_spec, fold_eval != k)
if(nrow(data_test) == 0 || nrow(data_train) == 0){
warning("Too few samples")
return(NULL) # skip this fold
}
# Create inner CV folds for model training
cv_train = blockCV::cv_spatial(
data_train,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
data_train$fold_train = cv_train$folds_ids
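# cv_spatial() uses random block-to-fold assignment by default, so fold
# composition can vary between runs; set a seed upstream for reproducibility.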
# Drop geometry, leaving plain data frames for model fitting
data_train = sf::st_drop_geometry(data_train)
data_test = sf::st_drop_geometry(data_test)
# Define caret training routine
index_train = lapply(unique(sort(data_train$fold_train)), function(x){
return(which(data_train$fold_train != x))
})
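# Each element of `index` lists the training rows of one resample; caret
# evaluates on the complementary rows, i.e. spatially blocked inner CV.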
train_ctrl = caret::trainControl(
search = "random",
classProbs = TRUE,
index = index_train,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
# Define predictors
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
## Random Forest #####
rf_performance = tryCatch({
# Fit model
rf_fit = caret::train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "rf",
trControl = train_ctrl,
tuneLength = 4,
verbose = F
)
evaluate_model(rf_fit, data_test)
}, error = function(e){
na_performance
})
## Gradient Boosted Machine ####
gbm_performance = tryCatch({
gbm_fit = train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "gbm",
trControl = train_ctrl,
tuneLength = 4,
verbose = F
)
evaluate_model(gbm_fit, data_test)
}, error = function(e){
na_performance
})
## Generalized Additive Model ####
gam_performance = tryCatch({
gam_fit = train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "gamSpline",
tuneLength = 4,
trControl = train_ctrl
)
evaluate_model(gam_fit, data_test)
}, error = function(e){
na_performance
})
## Neural Network ####
nn_formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
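# device = "cuda" assumes a GPU-enabled torch installation; without one,
# cito::dnn() errors and the tryCatch below records NA performance instead.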
nn_performance = tryCatch({
nn_fit = dnn(
nn_formula,
data = data_train,
hidden = c(200L, 200L, 200L),
loss = "binomial",
activation = c("sigmoid", "leaky_relu", "leaky_relu"),
epochs = 200L,
burnin = 100L,
lr = 0.001,
batchsize = max(round(nrow(data_train) / 10), 64),
lambda = 0.0001,
dropout = 0.2,
optimizer = config_optimizer("adam", weight_decay = 0.001),
lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
early_stopping = 100,
validation = 0.2,
device = "cuda",
verbose = F,
plot = F
)
evaluate_model(nn_fit, data_test)
}, error = function(e){
na_performance
})
# Summarize results
performance_summary = tibble(
species = !!species,
obs = nrow(data_train),
fold_eval = k,
model = c("RF", "GBM", "GAM", "NN"),
auc = c(rf_performance$AUC, gbm_performance$AUC, gam_performance$AUC, nn_performance$AUC),
accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, gam_performance$Accuracy, nn_performance$Accuracy),
kappa = c(rf_performance$Kappa, gbm_performance$Kappa, gam_performance$Kappa, nn_performance$Kappa),
precision = c(rf_performance$Precision, gbm_performance$Precision, gam_performance$Precision, nn_performance$Precision),
recall = c(rf_performance$Recall, gbm_performance$Recall, gam_performance$Recall, nn_performance$Recall),
f1 = c(rf_performance$F1, gbm_performance$F1, gam_performance$F1, nn_performance$F1)
) %>%
tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value")
return(performance_summary)
})
# Combine and save evaluation results
performance_spec = bind_rows(performance_cv) %>%
dplyr::arrange(fold_eval, model, metric)
save(performance_spec, file = paste0("data/r_objects/ssdm_results/", species, ".RData"))
}
# Combine results
files = list.files("data/r_objects/ssdm_results/", full.names = T, pattern = "\\.RData$")
ssdm_results = lapply(files, function(f){load(f); return(performance_spec)}) %>%
bind_rows()
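# ssdm_results: one row per species x eval fold x model x metric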
save(ssdm_results, file = "data/r_objects/ssdm_results.RData")
# ----------------------------------------------------------------------#
# Train full models ####
# ----------------------------------------------------------------------#
data_split = model_data %>%
dplyr::filter(!is.na(fold_eval)) %>%
dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>%
dplyr::group_by(species) %>%
dplyr::group_split()
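# Final models are trained on all data with valid evaluation folds; the
# spatial folds below are only used for hyperparameter tuning.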
for(pa_spec in data_split){
species = pa_spec$species[1]
print(species)
# Create inner CV folds for model training
cv_train = blockCV::cv_spatial(
pa_spec,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
pa_spec$fold_train = cv_train$folds_ids
# Drop geometry, leaving a plain data frame for model fitting
pa_spec = sf::st_drop_geometry(pa_spec)
# Define caret training routine
index_train = lapply(unique(sort(pa_spec$fold_train)), function(x){
return(which(pa_spec$fold_train != x))
})
train_ctrl = caret::trainControl(
search = "random",
classProbs = TRUE,
index = index_train,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
# Define predictors
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
# Fit models
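# The output directory "data/r_objects/ssdm_results/models/" is assumed to
# exist already; save() fails if it does not.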
try({
rf_fit = caret::train(
x = pa_spec[, predictors],
y = pa_spec$present_fct,
method = "rf",
trControl = train_ctrl,
tuneLength = 4,
verbose = F
)
save(rf_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_rf_fit.RData"))
})
try({
gbm_fit = train(
x = pa_spec[, predictors],
y = pa_spec$present_fct,
method = "gbm",
trControl = train_ctrl,
tuneLength = 4,
verbose = F
)
save(gbm_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gbm_fit.RData"))
})
try({
gam_fit = train(
x = pa_spec[, predictors],
y = pa_spec$present_fct,
method = "gamSpline",
tuneLength = 4,
trControl = train_ctrl
)
save(gam_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gam_fit.RData"))
})
try({
nn_formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
nn_fit = dnn(
nn_formula,
data = pa_spec,
hidden = c(200L, 200L, 200L),
loss = "binomial",
activation = c("sigmoid", "leaky_relu", "leaky_relu"),
epochs = 200L,
burnin = 100L,
lr = 0.001,
batchsize = max(round(nrow(pa_spec) / 10), 64),
lambda = 0.0001,
dropout = 0.2,
optimizer = config_optimizer("adam", weight_decay = 0.001),
lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
early_stopping = 100,
validation = 0.2,
device = "cuda",
verbose = F,
plot = F
)
save(nn_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_nn_fit.RData"))
})
}