diff --git a/R/03_04_modelling_msdm_rf.R b/R/03_04_modelling_msdm_rf.R index 1e03f70098f0c9086cdd9afc515fbb3578d2704c..f0f187c920323afe75be76eb3e2ea9669f9fca17 100644 --- a/R/03_04_modelling_msdm_rf.R +++ b/R/03_04_modelling_msdm_rf.R @@ -46,7 +46,7 @@ for(fold in 1:5){ num.threads = 48 ) - save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData")) + save(rf_fit, file = paste0("data/r_objects/msdm_rf_results/msdm_rf_fit_fold", fold,".RData")) } # Full model @@ -71,13 +71,13 @@ rf_fit = caret::train( num.threads = 48 ) -save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData") +save(rf_fit, file = "data/r_objects/msdm_rf_results/msdm_rf_fit_full.RData") # ----------------------------------------------------------------------# # Evaluate model #### # ----------------------------------------------------------------------# msdm_rf_performance = lapply(1:5, function(fold){ - load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData")) + load(paste0("data/r_objects/msdm_rf_results/msdm_rf_fit_fold", fold, ".RData")) test_data = model_data %>% dplyr::filter(fold_global == fold) %>% @@ -137,4 +137,4 @@ msdm_rf_performance = lapply(1:5, function(fold){ }) %>% bind_rows() -save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData")) +save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_results_performance.RData")) diff --git a/R/utils.R b/R/utils.R index 464072b436ddbd8bed11a5a5fdbfb0cfe84470a7..d324645483d6334dd891a10181d112f66dd8e19a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,7 +1,42 @@ -expand_bbox <- function(bbox, min_span = 1, expansion = 0.25) { +setup_dirs = function(){ + checkmate::assert_true(basename(getwd()) == "symobio-modeling") + + dirs = c( + "data", + "data/geospatial", + "data/phylogenies", + "data/r_objects", + "data/r_objects/pa_sampling", + "data/r_objects/ssdm_results", + "data/r_objects/msdm_embed_results", + "data/r_objects/msdm_onehot_results", + "data/r_objects/msdm_embed_results", + "data/r_objects/msdm_rf_results", + "plots", + "plots/pa_sampling", + "plots/publication", + "plots/range_predictions" + ) + + sapply(dirs, function(dir){ + tryCatch({ + if(!dir.exists(dir)){ + dir.create(dir) + } else { + message("Skipping directory '", dir, "': already exists") + } + }, error = function(e){ + message("Couldn't create directory '", dir, "': ", e) + }) + }) + + return(invisible(NULL)) +} + +expand_bbox = function(bbox, min_span = 1, expansion = 0.25) { # Get current bbox dimensions - x_range <- bbox["xmax"] - bbox["xmin"] - y_range <- bbox["ymax"] - bbox["ymin"] + x_range = bbox["xmax"] - bbox["xmin"] + y_range = bbox["ymax"] - bbox["ymin"] x_expand = expansion y_expand = expansion @@ -17,10 +52,10 @@ expand_bbox <- function(bbox, min_span = 1, expansion = 0.25) { } # Expand the limits, adjusting both directions correctly - bbox["xmin"] <- bbox["xmin"] - (x_expand * x_range) - bbox["xmax"] <- bbox["xmax"] + (x_expand * x_range) - bbox["ymin"] <- bbox["ymin"] - (y_expand * y_range) - bbox["ymax"] <- bbox["ymax"] + (y_expand * y_range) + bbox["xmin"] = bbox["xmin"] - (x_expand * x_range) + bbox["xmax"] = bbox["xmax"] + (x_expand * x_range) + bbox["ymin"] = bbox["ymin"] - (y_expand * y_range) + bbox["ymax"] = bbox["ymax"] + (y_expand * y_range) return(bbox) } @@ -47,7 +82,7 @@ predict_new = function(model, data, type = "prob"){ } } -evaluate_model <- function(model, data) { +evaluate_model = function(model, data) { # Accuracy: The proportion of correctly predicted instances (both true positives and true negatives) out of the total instances. # Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN) @@ -75,7 +110,7 @@ evaluate_model <- function(model, data) { auc = pROC::roc(actual, probs, levels = c("P", "A"), direction = ">")$auc # Calculate confusion matrix - cm <- caret::confusionMatrix(preds, actual, positive = "P") + cm = caret::confusionMatrix(preds, actual, positive = "P") # Return metrics return( diff --git a/README.md b/README.md index e3aa5ea57899c5cef869d495fd302c3e1ed32ea7..f015a0602f874670af787202e7f8767cd8a74771 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,18 @@ -# Codebase Documentation +# Symobio Modeling -This repository implements a species distribution modeling comparison study for about 600 South American mammal species. Specifically, the study compares different modeling approaches for predicting species distributions. +Code for a comparative SDM study for about 600 South American mammal species. Specifically, the study compares different modeling approaches for predicting species distributions. + +An analysis of model performance can be found here: https://chrkoenig.quarto.pub/sdm-performance-report/ ## Project Structure - **`R/`**: Contains all the R scripts organized by workflow steps. -- **`renv/`**: Manages package dependencies for reproducibility. - **`Symobio_modeling.Rproj`**: RStudio project file for easy navigation. - **`README.md`**: High-level overview of the project. -- **`occurrences.png`**: Visualization or reference image for occurrences data. -- **`.Rprofile`**: Custom R environment settings. +- **`renv/`**: Manages package dependencies for reproducibility. - **`renv.lock`**: Lockfile for `renv` to ensure consistent package versions. +- **`data/`**: Input data (geo, phylo), intermediate data and modeling results +- **`plots/`**: Plots for visualizing data processing and analysis steps ## Workflow Overview @@ -21,7 +23,7 @@ Pre-process species-specific and environmental information for model fitting and - **`01_01_range_preparation.R`**: Process species range maps and calculate range dissimilarity. - **`01_02_traits_preparation.R`**: Prepare species trait data and calculate functional distances. -- **`01_03_phylo_preparation.R`**: Process phylogenetic information and alculate phylogenetic distances. +- **`01_03_phylo_preparation.R`**: Process phylogenetic information and calculate phylogenetic distances. - **`01_04_raster_preparation.R`**: Prepare environmental raster layers for modeling for data extraction. ### 2. Presence/Absence Data Processing @@ -56,7 +58,8 @@ Analyse modeling results ```r renv::restore() ``` -3. Run the scripts in the R/ directory sequentially. Some scripts, especially for model fitting, may run a long time and benefit from powerful hardware. +3. Set up the directory structure using the `setup_dirs()` function in the `utils.R` +4. Run the scripts in the R/ directory sequentially. Some scripts, especially for model fitting, may run a long time and benefit from powerful hardware. ## Additional Notes - Ensure that all required input data (e.g., range maps, raster files) is available in the expected directories.