diff --git a/R/03_presence_absence_preparation.R b/R/03_presence_absence_preparation.R
index 9fd817c71fa460cf27c87aa9d023296999392b67..78d9327dd475664272dfdfd82d8bdc4263b17e5e 100644
--- a/R/03_presence_absence_preparation.R
+++ b/R/03_presence_absence_preparation.R
@@ -126,14 +126,9 @@ model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::f
     pa_spec = occs_spec %>%
       dplyr::mutate(present = 1) %>%
       bind_rows(abs_spec)
-    
-    # Split into train and test datasets
-    train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
-    pa_spec$train = 0
-    pa_spec$train[train_index] = 1
-    
+    
     # Define cross-validation folds
-    folds = tryCatch({
+    folds_eval = tryCatch({
       spatial_folds = suppressMessages(
         blockCV::cv_spatial(
           pa_spec,
@@ -150,11 +145,11 @@ model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::f
       NA
     })
     
-    pa_spec$folds = folds
-    pa_spec$geometry = NULL
+    
+    pa_spec$fold_eval = folds_eval
     
     return(pa_spec)
   })
 
 model_data = bind_rows(model_data)
-save(model_data, file = "data/r_objects/model_data.RData")
+save(model_data, file = "data/r_objects/nest_cv/model_data.RData")
diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R
index 959f77c38589b6ad8594c0a3d465aaa6f9907f7a..ddef6965715f976e9bf90267a213daa1c7ec4585 100644
--- a/R/04_01_ssdm_modeling.R
+++ b/R/04_01_ssdm_modeling.R
@@ -7,147 +7,161 @@ library(pROC)
 source("R/utils.R")
 
-load("data/r_objects/model_data.RData")
+load("data/r_objects/full_evaluation/model_data.RData")
 
 data_split = split(model_data, model_data$species)
 
+# Define empty result for performance eval (returned when a model fit fails)
+na_performance = list(
+  AUC = NA,
+  Accuracy = NA,
+  Kappa = NA,
+  Precision = NA,
+  Recall = NA,
+  F1 = NA
+)
+
 # ----------------------------------------------------------------------#
 # Train models ####
 # ----------------------------------------------------------------------#
 future::plan("multisession", workers = 8)
 
 ssdm_results = furrr::future_map(data_split, .options = furrr::furrr_options(seed = 123), .f = function(data_spec){
-  # Initial check
-  if(nrow(data_spec) < 10 | anyNA(data_spec$folds)){
-    return(NULL)
-  }
-  
   data_spec$present_fct = factor(data_spec$present, levels = c("0", "1"), labels = c("A", "P"))
-  train_data = dplyr::filter(data_spec, train == 1)
-  test_data = dplyr::filter(data_spec, train == 0)
-  
-  # Define empty result for performance eval
-  na_performance = list(
-    AUC = NA,
-    Accuracy = NA,
-    Kappa = NA,
-    Precision = NA,
-    Recall = NA,
-    F1 = NA
-  )
-  
-  # Define predictors
-  predictors = paste0("layer_", 1:19)
-  
-  # Define caret training routine #####
-  index_train = lapply(unique(sort(train_data$fold)), function(x){
-    return(which(train_data$fold != x))
-  })
-  train_ctrl = trainControl(
-    search = "grid",
-    classProbs = TRUE,
-    index = index_train,
-    summaryFunction = twoClassSummary,
-    savePredictions = "final"
-  )
-  
-  # Random Forest #####
-  rf_performance = tryCatch({
-    rf_grid = expand.grid(
-      mtry = c(3,7,11,15,19) # Number of randomly selected predictors
-    )
+  # Outer CV loop
+  for(k in sort(unique(data_spec$fold_eval))){
+    data_test = dplyr::filter(data_spec, fold_eval == k)
+    data_train = dplyr::filter(data_spec, fold_eval != k)
     
-    rf_fit = caret::train(
-      x = train_data[, predictors],
-      y = train_data$present_fct,
-      method = "rf",
-      metric = "ROC",
-      tuneGrid = rf_grid,
-      trControl = train_ctrl
-    )
-    evaluate_model(rf_fit, test_data)
-  }, error = function(e){
-    na_performance
-  })
-  
-  # Gradient Boosted Machine ####
-  gbm_performance = tryCatch({
-    gbm_grid <- expand.grid(
-      n.trees = c(100, 500, 1000, 1500), # number of trees
-      interaction.depth = c(3, 5, 7), # Maximum depth of each tree
-      shrinkage = c(0.01, 0.005, 0.001), # Lower learning rates
-      n.minobsinnode = c(10, 20) # Minimum number of observations in nodes
-    )
+    if(nrow(data_test) == 0 || nrow(data_train) == 0){
+      next
+    }
     
-    gbm_fit = train(
-      x = train_data[, predictors],
-      y = train_data$present_fct,
-      method = "gbm",
-      metric = "ROC",
-      verbose = F,
-      tuneGrid = gbm_grid,
-      trControl = train_ctrl
+    # Create inner CV folds for model training
+    cv_train = blockCV::cv_spatial(
+      data_train,
+      column = "present",
+      k = 5,
+      progress = F, plot = F, report = F
     )
-    evaluate_model(gbm_fit, test_data)
-  }, error = function(e){
-    na_performance
-  })
-  
-  # Generalized additive Model ####
-  glm_performance = tryCatch({
-    glm_fit = train(
-      x = train_data[, predictors],
-      y = train_data$present_fct,
-      method = "glm",
-      family=binomial,
-      metric = "ROC",
-      preProcess = c("center", "scale"),
-      trControl = train_ctrl
+    data_train$fold_train = cv_train$folds_ids
+    
+    # Define caret training routine #####
+    index_train = lapply(unique(sort(data_train$fold_train)), function(x){
+      return(which(data_train$fold_train != x))
+    })
+    
+    train_ctrl = trainControl(
+      search = "grid",
+      classProbs = TRUE,
+      index = index_train,
+      summaryFunction = twoClassSummary,
+      savePredictions = "final"
     )
-    evaluate_model(glm_fit, test_data)
-  }, error = function(e){
-    na_performance
-  })
-  # Neural Network ####
-  nn_performance = tryCatch({
-    nn_fit = dnn(
-      X = train_data[, predictors],
-      Y = train_data$present,
-      hidden = c(200L, 200L, 200L),
-      loss = "binomial",
-      activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-      epochs = 500L,
-      lr = 0.02,
-      baseloss=10,
-      batchsize=nrow(train_data)/4,
-      dropout = 0.1, # Regularization
-      optimizer = config_optimizer("adam", weight_decay = 0.001),
-      lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-      early_stopping = 200, # stop training when validation loss does not decrease anymore
-      validation = 0.3, # used for early stopping and lr_scheduler
-      device = "cuda",
-      bootstrap = 5
-    )
+    # Define predictors
+    predictors = paste0("layer_", 1:19)
     
-    evaluate_model(nn_fit, test_data)
-  }, error = function(e){
-    na_performance
-  })
-  
-  # Summarize results
-  performance_summary = tibble(
-    species = data_spec$species[1],
-    obs = nrow(data_spec),
-    model = c("SSDM_RF", "SSDM_GBM", "SSDM_GLM", "SSDM_NN"),
-    auc = c(rf_performance$AUC, gbm_performance$AUC, glm_performance$AUC, nn_performance$AUC),
-    accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, glm_performance$Accuracy, nn_performance$Accuracy),
-    kappa = c(rf_performance$Kappa, gbm_performance$Kappa, glm_performance$Kappa, nn_performance$Kappa),
-    precision = c(rf_performance$Precision, gbm_performance$Precision, glm_performance$Precision, nn_performance$Precision),
-    recall = c(rf_performance$Recall, gbm_performance$Recall, glm_performance$Recall, nn_performance$Recall),
-    f1 = c(rf_performance$F1, gbm_performance$F1, glm_performance$F1, nn_performance$F1)
-  )
-  
+    # Random Forest #####
+    rf_performance = tryCatch({
+      rf_grid = expand.grid(
+        mtry = c(3,7,11,15,19) # Number of randomly selected predictors
+      )
+      
+      rf_fit = caret::train(
+        x = data_train[, predictors],
+        y = data_train$present_fct,
+        method = "rf",
+        metric = "ROC",
+        tuneGrid = rf_grid,
+        trControl = train_ctrl
+      )
+      evaluate_model(rf_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    # Gradient Boosted Machine ####
+    gbm_performance = tryCatch({
+      gbm_grid <- expand.grid(
+        n.trees = c(100, 500, 1000, 1500), # number of trees
+        interaction.depth = c(3, 5, 7), # Maximum depth of each tree
+        shrinkage = c(0.01, 0.005, 0.001), # Lower learning rates
+        n.minobsinnode = c(10, 20) # Minimum number of observations in nodes
+      )
+      
+      gbm_fit = train(
+        x = data_train[, predictors],
+        y = data_train$present_fct,
+        method = "gbm",
+        metric = "ROC",
+        verbose = F,
+        tuneGrid = gbm_grid,
+        trControl = train_ctrl
+      )
+      evaluate_model(gbm_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    # Generalized additive Model ####
+    glm_performance = tryCatch({
+      glm_fit = train(
+        x = data_train[, predictors],
+        y = data_train$present_fct,
+        method = "glm",
+        family=binomial,
+        metric = "ROC",
+        preProcess = c("center", "scale"),
+        trControl = train_ctrl
+      )
+      evaluate_model(glm_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    # Neural Network ####
+    nn_performance = tryCatch({
+      predictors = paste0("layer_", 1:19)
+      formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
+      
+      nn_fit = dnn(
+        formula,
+        data = data_train,
+        hidden = c(500L, 500L, 500L),
+        loss = "binomial",
+        activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+        epochs = 500L,
+        lr = 0.001,
+        baseloss = 1,
+        batchsize = nrow(data_train),
+        dropout = 0.1,
+        burnin = 100,
+        optimizer = config_optimizer("adam", weight_decay = 0.001),
+        lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
+        early_stopping = 250,
+        validation = 0.3,
+        device = "cuda"
+      )
+      
+      evaluate_model(nn_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    # Summarize results
+    performance_summary = tibble(
+      species = data_train$species[1],
+      obs = nrow(data_train),
+      model = c("SSDM_RF", "SSDM_GBM", "SSDM_GLM", "SSDM_NN"),
+      auc = c(rf_performance$AUC, gbm_performance$AUC, glm_performance$AUC, nn_performance$AUC),
+      accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, glm_performance$Accuracy, nn_performance$Accuracy),
+      kappa = c(rf_performance$Kappa, gbm_performance$Kappa, glm_performance$Kappa, nn_performance$Kappa),
+      precision = c(rf_performance$Precision, gbm_performance$Precision, glm_performance$Precision, nn_performance$Precision),
+      recall = c(rf_performance$Recall, gbm_performance$Recall, glm_performance$Recall, nn_performance$Recall),
+      f1 = c(rf_performance$F1, gbm_performance$F1, glm_performance$F1, nn_performance$F1)
+    )
+  }
   return(performance_summary)
 })
diff --git a/R/04_03_msdm_embed_raw.R b/R/04_03_msdm_embed_raw.R
index 6e7924a172401b7f428a55183f580eafa8ec0bc5..03a8737f138b5ba220b92c244e46a368c5ccc855 100644
--- a/R/04_03_msdm_embed_raw.R
+++ b/R/04_03_msdm_embed_raw.R
@@ -34,7 +34,7 @@ msdm_fit_embedding_raw = dnn(
   lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
   early_stopping = 250,
   validation = 0.3,
-  device = "cuda",
+  device = "cuda"
 )
 
 save(msdm_fit_embedding_raw, file = "data/r_objects/msdm_fit_embedding_raw.RData")
diff --git a/R/utils.R b/R/utils.R
index b1da26abc38515507c2381fa178c92bab4e50296..1c9dc02ef9d965b3ed9a02f7552aec85de46141f 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -25,7 +25,7 @@ expand_bbox <- function(bbox, min_span = 1, expansion = 0.25) {
   return(bbox)
 }
 
-evaluate_model <- function(model, test_data) {
+evaluate_model <- function(model, data) {
   # Accuracy: The proportion of correctly predicted instances (both true positives and true
   # negatives) out of the total instances.
   # Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN)
@@ -40,14 +40,14 @@ evaluate_model <- function(model, test_data) {
 
   # Predict probabilities
   if(class(model) %in% c("citodnn", "citodnnBootstrap")){
-    probs <- predict(model, as.matrix(test_data), type = "response")[,1]
+    probs <- predict(model, as.matrix(data), type = "response")[,1]
     preds <- factor(round(probs), levels = c("0", "1"), labels = c("A", "P"))
   } else {
-    probs <- predict(model, test_data, type = "prob")$P
-    preds <- predict(model, test_data, type = "raw")
+    probs <- predict(model, data, type = "prob")$P
+    preds <- predict(model, data, type = "raw")
   }
 
-  actual <- factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
+  actual <- factor(data$present, levels = c("0", "1"), labels = c("A", "P"))
 
   # Calculate AUC
   auc <- pROC::roc(actual, probs, levels = c("P", "A"), direction = ">")$auc
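Side note on aggregating the outer-loop results in R/04_01_ssdm_modeling.R: each pass of the fold_eval loop builds one performance_summary tibble from the lists returned by evaluate_model(). Below is a minimal sketch of how such per-fold results could be stacked into a single long table. The helper name summarise_folds and its arguments are hypothetical and not part of the scripts above; the only assumption is the named list structure (AUC, Accuracy, Kappa, Precision, Recall, F1) that evaluate_model() appears to return, as used by the performance_summary tibble.

library(dplyr)
library(tibble)

# Hypothetical helper (illustration only, not part of the diff above):
# stack per-fold evaluation results into one long table.
# `perf_by_fold` is a named list with one element per outer fold; each element is
# the list returned by evaluate_model() (AUC, Accuracy, Kappa, Precision, Recall, F1).
summarise_folds = function(perf_by_fold, species, model_name) {
  rows = lapply(names(perf_by_fold), function(k) {
    perf = perf_by_fold[[k]]
    tibble(
      species   = species,
      fold_eval = as.integer(k),
      model     = model_name,
      auc       = perf$AUC,
      accuracy  = perf$Accuracy,
      kappa     = perf$Kappa,
      precision = perf$Precision,
      recall    = perf$Recall,
      f1        = perf$F1
    )
  })
  dplyr::bind_rows(rows)
}

Called once per model type with the results collected over the outer folds, something along these lines would retain one row per fold and model rather than only the values from the final iteration.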