library(dplyr)
library(tidyr)
library(caret)
library(ranger)

source("R/utils.R")

load("data/r_objects/model_data.RData")

# Recode the response as a factor with valid R level names ("A"/"P"),
# as required by caret when classProbs = TRUE
model_data = model_data %>%
  dplyr::mutate(
    present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
  ) %>%
  sf::st_drop_geometry()

# ----------------------------------------------------------------------#
# Train model                                                        ####
# ----------------------------------------------------------------------#
# Define predictors
predictors = c(paste0("bio", 1:19), "species")

# Cross validation
for(fold in 1:5){
  print(paste("Fold", fold))

  ## Preparations #####
  # Background records are always kept for training; presence records are
  # held out according to their global fold assignment
  data_train = model_data %>%
    dplyr::filter(record_type == "background" | fold_global != fold)

  # Define caret training routine
  train_ctrl = caret::trainControl(
    search = "random",
    classProbs = TRUE,
    number = 5,
    summaryFunction = caret::twoClassSummary,
    savePredictions = "final"
  )

  # Run model
  # Note: twoClassSummary reports ROC/Sens/Spec, so the tuning metric must
  # be "ROC" rather than "Accuracy"
  rf_fit = caret::train(
    x = data_train[, predictors],
    y = data_train$present_fct,
    method = "ranger",
    metric = "ROC",
    trControl = train_ctrl,
    tuneLength = 8,
    weights = data_train$weight,
    num.threads = 48
  )

  save(rf_fit, file = paste0("data/r_objects/msdm_rf_results/msdm_rf_fit_fold", fold, ".RData"))
}

# Full model
# Define caret training routine
train_ctrl = caret::trainControl(
  search = "random",
  classProbs = TRUE,
  number = 5,
  summaryFunction = caret::twoClassSummary,
  savePredictions = "final"
)

# Run model
rf_fit = caret::train(
  x = model_data[, predictors],
  y = model_data$present_fct,
  method = "ranger",
  metric = "ROC",
  trControl = train_ctrl,
  tuneLength = 8,
  weights = model_data$weight,
  num.threads = 48
)

save(rf_fit, file = "data/r_objects/msdm_rf_results/msdm_rf_fit_full.RData")

# ----------------------------------------------------------------------#
# Evaluate model                                                     ####
# ----------------------------------------------------------------------#
msdm_rf_performance = lapply(1:5, function(fold){
  # Load the fitted model for this fold (restores `rf_fit`)
  load(paste0("data/r_objects/msdm_rf_results/msdm_rf_fit_fold", fold, ".RData"))

  # Held-out test data; geometry was already dropped above
  test_data = model_data %>%
    dplyr::filter(fold_global == fold)

  actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
  probs = predict_new(rf_fit, test_data, type = "prob")
  preds = predict_new(rf_fit, test_data, type = "class")

  # Split predictions by species for species-level evaluation
  eval_dfs = data.frame(
    species = test_data$species,
    actual,
    probs,
    preds
  ) %>%
    group_by(species) %>%
    group_split()

  lapply(eval_dfs, function(eval_df_spec){
    species = eval_df_spec$species[1]

    performance = tryCatch({
      auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs,
                      levels = c("P", "A"), direction = ">")$auc
      cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")

      list(
        auc = as.numeric(auc),
        accuracy = cm$overall["Accuracy"],
        kappa = cm$overall["Kappa"],
        precision = cm$byClass["Precision"],
        recall = cm$byClass["Recall"],
        f1 = cm$byClass["F1"],
        tp = cm$table["P", "P"],
        fp = cm$table["P", "A"],
        tn = cm$table["A", "A"],
        fn = cm$table["A", "P"]
      )
    }, error = function(e){
      # Fall back to NA metrics, e.g. when a species has only one class
      # in the test fold
      list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_,
           precision = NA_real_, recall = NA_real_, f1 = NA_real_,
           tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_)
    })

    performance_summary = performance %>%
      as_tibble() %>%
      mutate(
        species = !!species,
        obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
        fold_global = !!fold,
        model = "msdm_rf"
      ) %>%
      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")),
                          names_to = "metric", values_to = "value") %>%
      drop_na()
  }) %>%
    bind_rows()
}) %>%
  bind_rows()

save(msdm_rf_performance, file = "data/r_objects/msdm_rf_results_performance.RData")