# Train and evaluate single-species SDMs (RF, GBM, GAM, NN) with spatially
# blocked cross-validation, save per-species performance tables, then refit
# each model on all evaluation data and save the fitted objects.
library(furrr)
library(dplyr)
library(tidyr)
library(sf)
library(caret)
library(cito)
library(pROC)

source("R/utils.R")  # provides evaluate_model()
load("data/r_objects/model_data.RData")

# ----------------------------------------------------------------------#
# Run training ####
# ----------------------------------------------------------------------#
# Skip species whose results already exist on disk.
# Fixed: anchored the regex — the unanchored ".RData" treats '.' as
# any-character and would also match unintended file names.
species_processed <- list.files("data/r_objects/ssdm_results/", pattern = "\\.RData$") %>%
  stringr::str_remove("\\.RData$")

data_split <- model_data %>%
  dplyr::filter(!species %in% species_processed) %>%
  dplyr::group_by(species) %>%
  dplyr::group_split()

for (pa_spec in data_split) {
  species <- pa_spec$species[1]
  print(species)

  if (all(is.na(pa_spec$fold_eval))) {
    print("Too few samples")
    next
  }

  # Placeholder returned when a model fails to fit or evaluate
  na_performance <- list(
    AUC = NA,
    Accuracy = NA,
    Kappa = NA,
    Precision = NA,
    Recall = NA,
    F1 = NA
  )

  # caret's classProbs requires factor levels that are valid R names
  pa_spec$present_fct <- factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))

  # Outer CV loop (for averaging performance metrics)
  performance_cv <- lapply(sort(unique(pa_spec$fold_eval)), function(k) {
    print(paste("Fold", k))

    ## Preparations #####
    data_test <- dplyr::filter(pa_spec, fold_eval == k)
    data_train <- dplyr::filter(pa_spec, fold_eval != k)

    if (nrow(data_test) == 0 || nrow(data_train) == 0) {
      # Fixed: the original only warned and fell through, so
      # blockCV::cv_spatial below would error on an empty sf object.
      # Skip the fold; bind_rows() drops the NULL later.
      warning("Too few samples")
      return(NULL)
    }

    # Create inner CV folds for model training
    cv_train <- blockCV::cv_spatial(
      data_train,
      column = "present",
      k = 5,
      progress = FALSE,
      plot = FALSE,
      report = FALSE
    )
    data_train$fold_train <- cv_train$folds_ids

    # Drop geometry (plain data frames from here on)
    data_train$geometry <- NULL
    data_test$geometry <- NULL

    # Define caret training routine: each resample trains on all inner
    # folds except one (spatially blocked leave-one-fold-out)
    index_train <- lapply(unique(sort(data_train$fold_train)), function(x) {
      return(which(data_train$fold_train != x))
    })

    train_ctrl <- caret::trainControl(
      search = "random",
      classProbs = TRUE,
      index = index_train,
      summaryFunction = caret::twoClassSummary,
      savePredictions = "final"  # fixed: removed trailing comma (empty argument)
    )

    # Define predictors
    predictors <- c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")

    ## Random Forest #####
    rf_performance <- tryCatch({
      rf_fit <- caret::train(
        x = data_train[, predictors],
        y = data_train$present_fct,
        method = "rf",
        trControl = train_ctrl,
        tuneLength = 4,
        verbose = FALSE
      )
      evaluate_model(rf_fit, data_test)
    }, error = function(e) {
      na_performance
    })

    ## Gradient Boosted Machine ####
    gbm_performance <- tryCatch({
      gbm_fit <- caret::train(
        x = data_train[, predictors],
        y = data_train$present_fct,
        method = "gbm",
        trControl = train_ctrl,
        tuneLength = 4,
        verbose = FALSE
      )
      evaluate_model(gbm_fit, data_test)
    }, error = function(e) {
      na_performance
    })

    ## Generalized Additive Model ####
    gam_performance <- tryCatch({
      gam_fit <- caret::train(
        x = data_train[, predictors],
        y = data_train$present_fct,
        method = "gamSpline",
        tuneLength = 4,
        trControl = train_ctrl
      )
      evaluate_model(gam_fit, data_test)
    }, error = function(e) {
      na_performance
    })

    ## Neural Network ####
    # Renamed from `formula` to avoid masking base::formula
    nn_formula <- as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
    nn_performance <- tryCatch({
      nn_fit <- dnn(
        nn_formula,
        data = data_train,
        hidden = c(200L, 200L, 200L),
        loss = "binomial",
        activation = c("sigmoid", "leaky_relu", "leaky_relu"),
        epochs = 200L,
        burnin = 100L,
        lr = 0.001,
        # NOTE(review): batch size is derived from the *test* set size;
        # presumably nrow(data_train) was intended — confirm before changing.
        batchsize = max(nrow(data_test) / 10, 64),
        lambda = 0.0001,
        dropout = 0.2,
        optimizer = config_optimizer("adam", weight_decay = 0.001),
        lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
        early_stopping = 100,
        validation = 0.2,
        device = "cuda",
        verbose = FALSE,
        plot = FALSE
      )
      evaluate_model(nn_fit, data_test)
    }, error = function(e) {
      na_performance
    })

    # Summarize results (long format: one row per model x metric)
    performance_summary <- tibble(
      species = !!species,
      obs = nrow(data_train),
      fold_eval = k,
      model = c("RF", "GBM", "GAM", "NN"),
      auc = c(rf_performance$AUC, gbm_performance$AUC, gam_performance$AUC, nn_performance$AUC),
      accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, gam_performance$Accuracy, nn_performance$Accuracy),
      kappa = c(rf_performance$Kappa, gbm_performance$Kappa, gam_performance$Kappa, nn_performance$Kappa),
      precision = c(rf_performance$Precision, gbm_performance$Precision, gam_performance$Precision, nn_performance$Precision),
      recall = c(rf_performance$Recall, gbm_performance$Recall, gam_performance$Recall, nn_performance$Recall),
      f1 = c(rf_performance$F1, gbm_performance$F1, gam_performance$F1, nn_performance$F1)
    ) %>%
      tidyr::pivot_longer(
        all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")),
        names_to = "metric",
        values_to = "value"
      )

    return(performance_summary)
  })

  # Combine and save evaluation results (bind_rows() drops NULL folds)
  performance_spec <- bind_rows(performance_cv) %>%
    dplyr::arrange(fold_eval, model, metric)

  save(performance_spec, file = paste0("data/r_objects/ssdm_results/", species, ".RData"))
}

# Combine results across species
files <- list.files("data/r_objects/ssdm_results/", full.names = TRUE, pattern = "\\.RData$")
ssdm_results <- lapply(files, function(f) {
  load(f)  # loads `performance_spec` into this function's environment
  return(performance_spec)
}) %>%
  bind_rows()
save(ssdm_results, file = "data/r_objects/ssdm_results.RData")

# ----------------------------------------------------------------------#
# Train full models ####
# ----------------------------------------------------------------------#
data_split <- model_data %>%
  dplyr::filter(!is.na(fold_eval)) %>%
  dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>%
  dplyr::group_by(species) %>%
  dplyr::group_split()

for (pa_spec in data_split) {
  species <- pa_spec$species[1]
  print(species)

  # Create inner CV folds for model training
  cv_train <- blockCV::cv_spatial(
    pa_spec,
    column = "present",
    k = 5,
    progress = FALSE,
    plot = FALSE,
    report = FALSE
  )
  pa_spec$fold_train <- cv_train$folds_ids

  # Drop geometry (fixed: the original repeated this line twice)
  pa_spec$geometry <- NULL

  # Define caret training routine
  index_train <- lapply(unique(sort(pa_spec$fold_train)), function(x) {
    return(which(pa_spec$fold_train != x))
  })

  train_ctrl <- caret::trainControl(
    search = "random",
    classProbs = TRUE,
    index = index_train,
    summaryFunction = caret::twoClassSummary,
    savePredictions = "final"  # fixed: removed trailing comma (empty argument)
  )

  # Define predictors
  predictors <- c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")

  # Fit models; each fit is wrapped in try() and saved independently so a
  # single failing model does not block the others
  try({
    rf_fit <- caret::train(
      x = pa_spec[, predictors],
      y = pa_spec$present_fct,
      method = "rf",
      trControl = train_ctrl,
      tuneLength = 4,
      verbose = FALSE
    )
    save(rf_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_rf_fit.RData"))
  })

  try({
    gbm_fit <- caret::train(
      x = pa_spec[, predictors],
      y = pa_spec$present_fct,
      method = "gbm",
      trControl = train_ctrl,
      tuneLength = 4,
      verbose = FALSE
    )
    save(gbm_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gbm_fit.RData"))
  })

  try({
    gam_fit <- caret::train(
      x = pa_spec[, predictors],
      y = pa_spec$present_fct,
      method = "gamSpline",
      tuneLength = 4,
      trControl = train_ctrl
    )
    save(gam_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gam_fit.RData"))
  })

  try({
    # Renamed from `formula` to avoid masking base::formula
    nn_formula <- as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
    nn_fit <- dnn(
      nn_formula,
      data = pa_spec,
      hidden = c(200L, 200L, 200L),
      loss = "binomial",
      activation = c("sigmoid", "leaky_relu", "leaky_relu"),
      epochs = 200L,
      burnin = 100L,
      lr = 0.001,
      batchsize = max(nrow(pa_spec) / 10, 64),
      lambda = 0.0001,
      dropout = 0.2,
      optimizer = config_optimizer("adam", weight_decay = 0.001),
      lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
      early_stopping = 100,
      validation = 0.2,
      device = "cuda",
      verbose = FALSE,
      plot = FALSE
    )
    save(nn_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_nn_fit.RData"))
  })
}