From 1bb495317a8494525841f7b6b4aea741cf30f444 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Wed, 29 Jan 2025 14:59:38 +0100
Subject: [PATCH] restructured files, moved forward with results analysis

---
 R/03_02_absence_preparation.R            |   1 -
 R/04_01_modelling_ssdm.R                 | 298 +++++++++
 R/04_01_ssdm_modeling.R                  | 199 ------
 R/04_02_modelling_msdm_embed.R           | 112 ++++
 R/04_02_msdm_multiclass.R                |  68 --
 R/04_03_msdm_embed_raw.R                 |  81 ---
 R/04_04_msdm_embed_traits.R              | 143 -----
 R/04_05_msdm_embed_phylo.R               | 143 -----
 R/04_06_msdm_embed_ranges.R              | 144 -----
 R/04_07_msdm_embed_multi_nolonlat.R      | 111 ----
 R/04_07_msdm_oneoff.R                    | 111 ----
 R/04_08_msdm_embed_multi_lonlat.R        | 107 ----
 R/05_01_performance_analysis.qmd         | 627 ------------------
 R/05_01_performance_analysis_carsten.qmd | 190 ------
 R/05_01_performance_report.qmd           | 375 +++++++++++
 R/05_02_MSDM_comparison.qmd              | 779 -----------------------
 R/05_02_publication_analysis.R           | 382 +++++++++++
 R/_publish.yml                           |   4 +
 R/utils.R                                |  18 +-
 README.md                                |  92 +--
 snippets.R                               | 160 +++++
 21 files changed, 1343 insertions(+), 2802 deletions(-)
 create mode 100644 R/04_01_modelling_ssdm.R
 delete mode 100644 R/04_01_ssdm_modeling.R
 create mode 100644 R/04_02_modelling_msdm_embed.R
 delete mode 100644 R/04_02_msdm_multiclass.R
 delete mode 100644 R/04_03_msdm_embed_raw.R
 delete mode 100644 R/04_04_msdm_embed_traits.R
 delete mode 100644 R/04_05_msdm_embed_phylo.R
 delete mode 100644 R/04_06_msdm_embed_ranges.R
 delete mode 100644 R/04_07_msdm_embed_multi_nolonlat.R
 delete mode 100644 R/04_07_msdm_oneoff.R
 delete mode 100644 R/04_08_msdm_embed_multi_lonlat.R
 delete mode 100644 R/05_01_performance_analysis.qmd
 delete mode 100644 R/05_01_performance_analysis_carsten.qmd
 create mode 100644 R/05_01_performance_report.qmd
 delete mode 100644 R/05_02_MSDM_comparison.qmd
 create mode 100644 R/05_02_publication_analysis.R
 create mode 100644 snippets.R

diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index 03865cb..2726f8c 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -31,7 +31,6 @@ sa_polygon = rnaturalearth::ne_countries() %>%
   sf::st_union() 
 
 raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
-  #raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
   stringr::str_sort(numeric = T) 
 
 # Project (proj string generated with SA bbox coordinates on https://projectionwizard.org)
diff --git a/R/04_01_modelling_ssdm.R b/R/04_01_modelling_ssdm.R
new file mode 100644
index 0000000..c336fe3
--- /dev/null
+++ b/R/04_01_modelling_ssdm.R
@@ -0,0 +1,298 @@
+library(furrr)
+library(dplyr)
+library(tidyr)
+library(sf)
+library(caret)
+library(cito)
+library(pROC)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data.RData")
+
+# ----------------------------------------------------------------------#
+# Run training                                                       ####
+# ----------------------------------------------------------------------#
+species_processed = list.files("data/r_objects/ssdm_results/", pattern = "RData") %>% 
+  stringr::str_remove(".RData")
+
+data_split = model_data %>% 
+  dplyr::filter(!species %in% species_processed) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::group_split()
+
+
+for(pa_spec in data_split){
+  species = pa_spec$species[1]
+  print(species)
+  
+  if(all(is.na(pa_spec$fold_eval))){
+    print("Too few samples")
+    next
+  }
+  
+  # Define empty result for performance eval
+  na_performance = list(    
+    AUC = NA,
+    Accuracy = NA,
+    Kappa = NA,
+    Precision = NA,
+    Recall = NA,
+    F1 = NA
+  )
+  
+  # Create factor presence column 
+  pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))
+  
+  # Outer CV loop (for averaging performance metrics)
+  performance_cv = lapply(sort(unique(pa_spec$fold_eval)), function(k){
+    print(paste("Fold", k))
+    
+    ## Preparations #####
+    data_test = dplyr::filter(pa_spec, fold_eval == k)
+    data_train = dplyr::filter(pa_spec, fold_eval != k)
+    
+    if(nrow(data_test) == 0 || nrow(data_train) == 0){
+      warning("Too few samples")
+    }
+    
+    # Create inner CV folds for model training
+    cv_train = blockCV::cv_spatial(
+      data_train,
+      column = "present",
+      k = 5,
+      progress = F, plot = F, report = F
+    )
+    data_train$fold_train = cv_train$folds_ids
+    
+    # Drop geometry
+    data_train$geometry = NULL
+    data_test$geometry = NULL
+    
+    # Define caret training routine 
+    index_train = lapply(unique(sort(data_train$fold_train)), function(x){
+      return(which(data_train$fold_train != x))
+    })
+    
+    train_ctrl = caret::trainControl(
+      search = "random",
+      classProbs = TRUE, 
+      index = index_train,
+      summaryFunction = caret::twoClassSummary, 
+      savePredictions = "final",
+    )
+    
+    # Define predictors
+    predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
+    
+    ## Random Forest #####
+    rf_performance = tryCatch({
+      # Fit model
+      rf_fit = caret::train(
+        x = data_train[, predictors],
+        y = data_train$present_fct,
+        method = "rf",
+        trControl = train_ctrl,
+        tuneLength = 4,
+        verbose = F
+      )
+      
+      evaluate_model(rf_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    ## Gradient Boosted Machine ####
+    gbm_performance = tryCatch({
+      gbm_fit = train(
+        x = data_train[, predictors],
+        y = data_train$present_fct,
+        method = "gbm",
+        trControl = train_ctrl,
+        tuneLength = 4,
+        verbose = F
+      )
+      evaluate_model(gbm_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    ## Generalized Additive Model ####
+    gam_performance = tryCatch({
+      gam_fit = train(
+        x = data_train[, predictors],
+        y = data_train$present_fct,
+        method = "gamSpline",
+        tuneLength = 4,
+        trControl = train_ctrl
+      )
+      evaluate_model(gam_fit, data_test)
+    }, error = function(e){
+      na_performance
+    })
+    
+    ## Neural Network ####
+    formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
+    nn_performance = tryCatch({
+      nn_fit = dnn(
+        formula,
+        data = data_train,
+        hidden = c(200L, 200L, 200L),
+        loss = "binomial",
+        activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+        epochs = 200L, 
+        burnin = 100L,
+        lr = 0.001,   
+        batchsize = max(nrow(data_test)/10, 64),
+        lambda = 0.0001,
+        dropout = 0.2,
+        optimizer = config_optimizer("adam", weight_decay = 0.001),
+        lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
+        early_stopping = 100,
+        validation = 0.2,
+        device = "cuda",
+        verbose = F,
+        plot = F
+      )
+      
+      evaluate_model(nn_fit, data_test)  
+    }, error = function(e){
+      na_performance
+    })
+    
+    # Summarize results
+    performance_summary = tibble(
+      species = !!species,
+      obs = nrow(data_train),
+      fold_eval = k,
+      model = c("RF", "GBM", "GAM", "NN"),
+      auc = c(rf_performance$AUC, gbm_performance$AUC, gam_performance$AUC, nn_performance$AUC),
+      accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, gam_performance$Accuracy, nn_performance$Accuracy),
+      kappa = c(rf_performance$Kappa, gbm_performance$Kappa, gam_performance$Kappa, nn_performance$Kappa),
+      precision = c(rf_performance$Precision, gbm_performance$Precision, gam_performance$Precision, nn_performance$Precision),
+      recall = c(rf_performance$Recall, gbm_performance$Recall, gam_performance$Recall, nn_performance$Recall),
+      f1 = c(rf_performance$F1, gbm_performance$F1, gam_performance$F1, nn_performance$F1)
+    ) %>% 
+      tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") 
+    
+    return(performance_summary)
+  })
+  
+  # Combine and save evaluation results
+  performance_spec = bind_rows(performance_cv) %>% 
+    dplyr::arrange(fold_eval, model, metric)
+  
+  save(performance_spec, file = paste0("data/r_objects/ssdm_results/", species, ".RData"))
+}
+
+
+# Combine results  
+files = list.files("data/r_objects/ssdm_results/", full.names = T, pattern = ".RData")
+ssdm_results = lapply(files, function(f){load(f); return(performance_spec)}) %>% 
+  bind_rows() 
+
+save(ssdm_results, file = "data/r_objects/ssdm_results.RData")
+
+# ----------------------------------------------------------------------#
+# Train full models                                                  ####
+# ----------------------------------------------------------------------#
+data_split = model_data %>% 
+  dplyr::filter(!is.na(fold_eval)) %>% 
+  dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::group_split()
+
+for(pa_spec in data_split){
+  species = pa_spec$species[1]
+  print(species)
+  
+  # Create inner CV folds for model training
+  cv_train = blockCV::cv_spatial(
+    pa_spec,
+    column = "present",
+    k = 5,
+    progress = F, plot = F, report = F
+  )
+  pa_spec$fold_train = cv_train$folds_ids
+  
+  # Drop geometry
+  pa_spec$geometry = NULL
+  pa_spec$geometry = NULL
+  
+  # Define caret training routine 
+  index_train = lapply(unique(sort(pa_spec$fold_train)), function(x){
+    return(which(pa_spec$fold_train != x))
+  })
+  
+  train_ctrl = caret::trainControl(
+    search = "random",
+    classProbs = TRUE, 
+    index = index_train,
+    summaryFunction = caret::twoClassSummary, 
+    savePredictions = "final",
+  )
+  
+  # Define predictors
+  predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
+  
+  # Fit models
+  try({
+    rf_fit = caret::train(
+      x = pa_spec[, predictors],
+      y = pa_spec$present_fct,
+      method = "rf",
+      trControl = train_ctrl,
+      tuneLength = 4,
+      verbose = F
+    )
+    save(rf_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_rf_fit.RData"))
+  })
+  
+  try({
+    gbm_fit = train(
+      x = pa_spec[, predictors],
+      y = pa_spec$present_fct,
+      method = "gbm",
+      trControl = train_ctrl,
+      tuneLength = 4,
+      verbose = F
+    )
+    save(gbm_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gbm_fit.RData"))
+  })
+  
+  try({
+    gam_fit = train(
+      x = pa_spec[, predictors],
+      y = pa_spec$present_fct,
+      method = "gamSpline",
+      tuneLength = 4,
+      trControl = train_ctrl
+    )
+    save(gam_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_gam_fit.RData"))
+  })
+  
+  try({
+    formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
+    nn_fit = dnn(
+      formula,
+      data = pa_spec,
+      hidden = c(200L, 200L, 200L),
+      loss = "binomial",
+      activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+      epochs = 200L, 
+      burnin = 100L,
+      lr = 0.001,   
+      batchsize = max(nrow(pa_spec)/10, 64),
+      lambda = 0.0001,
+      dropout = 0.2,
+      optimizer = config_optimizer("adam", weight_decay = 0.001),
+      lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
+      early_stopping = 100,
+      validation = 0.2,
+      device = "cuda",
+      verbose = F,
+      plot = F
+    )
+    save(nn_fit, file = paste0("data/r_objects/ssdm_results/models/", species, "_nn_fit.RData"))
+  })
+}
diff --git a/R/04_01_ssdm_modeling.R b/R/04_01_ssdm_modeling.R
deleted file mode 100644
index 661e169..0000000
--- a/R/04_01_ssdm_modeling.R
+++ /dev/null
@@ -1,199 +0,0 @@
-library(furrr)
-library(dplyr)
-library(tidyr)
-library(sf)
-library(caret)
-library(cito)
-library(pROC)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-
-# ----------------------------------------------------------------------#
-# Run training                                                       ####
-# ----------------------------------------------------------------------#
-data_split = model_data %>% 
-  dplyr::group_by(species) %>% 
-  dplyr::group_split()
-
-future::plan("multisession", workers = 16)
-
-furrr::future_walk(
-  .x = data_split, 
-  .options = furrr::furrr_options(
-    seed = 123,
-    packages = c("dplyr", "tidyr", "sf", "caret", "cito", "pROC"),
-    scheduling = FALSE  # make sure workers get constant stream of work
-  ),
-  .f = function(pa_spec){
-    species = pa_spec$species[1]
-    
-    if(all(is.na(pa_spec$fold_eval))){
-      warning("Too few samples")
-      return()
-    }
-    
-    # Define empty result for performance eval
-    na_performance = list(    
-      AUC = NA,
-      Accuracy = NA,
-      Kappa = NA,
-      Precision = NA,
-      Recall = NA,
-      F1 = NA
-    )
-    
-    # Create factor presence column 
-    pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))
-    
-    # Outer CV loop (for averaging performance metrics)
-    performance_cv = lapply(sort(unique(pa_spec$fold_eval)), function(k){
-      data_test = dplyr::filter(pa_spec, fold_eval == k)
-      data_train = dplyr::filter(pa_spec, fold_eval != k)
-      
-      if(nrow(data_test) == 0 || nrow(data_train) == 0){
-        warning("Too few samples")
-      }
-      
-      # Create inner CV folds for model training
-      cv_train = blockCV::cv_spatial(
-        data_train,
-        column = "present",
-        k = 5,
-        progress = F, plot = F, report = F
-      )
-      data_train$fold_train = cv_train$folds_ids
-      
-      # Drop geometry
-      data_train$geometry = NULL
-      data_test$geometry = NULL
-      
-      # Define caret training routine #####
-      index_train = lapply(unique(sort(data_train$fold_train)), function(x){
-        return(which(data_train$fold_train != x))
-      })
-      
-      train_ctrl = caret::trainControl(
-        search = "random",
-        classProbs = TRUE, 
-        index = index_train,
-        summaryFunction = caret::twoClassSummary, 
-        savePredictions = "final",
-      )
-      
-      # Define predictors
-      predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
-      
-      # Random Forest #####
-      rf_performance = tryCatch({
-        # Fit model
-        rf_fit = caret::train(
-          x = data_train[, predictors],
-          y = data_train$present_fct,
-          method = "rf",
-          trControl = train_ctrl,
-          tuneLength = 4,
-          verbose = F
-        )
-        
-        evaluate_model(rf_fit, data_test)
-      }, error = function(e){
-        na_performance
-      })
-      
-      # Gradient Boosted Machine ####
-      gbm_performance = tryCatch({
-        gbm_fit = train(
-          x = data_train[, predictors],
-          y = data_train$present_fct,
-          method = "gbm",
-          trControl = train_ctrl,
-          tuneLength = 4,
-          verbose = F
-        )
-        evaluate_model(gbm_fit, data_test)
-      }, error = function(e){
-        na_performance
-      })
-      
-      # Generalized Additive Model ####
-      gam_performance = tryCatch({
-        gam_fit = train(
-          x = data_train[, predictors],
-          y = data_train$present_fct,
-          method = "gamSpline",
-          tuneLength = 4,
-          trControl = train_ctrl
-        )
-        evaluate_model(gam_fit, data_test)
-      }, error = function(e){
-        na_performance
-      })
-      
-      # Neural Network ####
-      nn_performance = tryCatch({
-        formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+')))
-        
-        nn_fit = dnn(
-          formula,
-          data = data_train,
-          hidden = c(100L, 100L, 100L),
-          loss = "binomial",
-          activation = c("leaky_relu", "leaky_relu", "leaky_relu"),
-          epochs = 500L, 
-          burnin = 100L,
-          lr = 0.001,   
-          batchsize = max(nrow(data_test)/10, 32),
-          lambda = 0.01,
-          dropout = 0.2,
-          optimizer = config_optimizer("adam", weight_decay = 0.001),
-          lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 50, factor = 0.7),
-          early_stopping = 100,
-          validation = 0.2,
-          device = "cuda",
-          verbose = F,
-          plot = F
-        )
-        
-        if(nn_fit$successfull == 1){
-          evaluate_model(nn_fit, data_test)  
-        } else {
-          na_performance
-        }
-      }, error = function(e){
-        na_performance
-      })
-      
-      # Summarize results
-      performance_summary = tibble(
-        species = !!species,
-        obs = nrow(data_train),
-        fold_eval = k,
-        model = c("RF", "GBM", "GAM", "NN"),
-        auc = c(rf_performance$AUC, gbm_performance$AUC, gam_performance$AUC, nn_performance$AUC),
-        accuracy = c(rf_performance$Accuracy, gbm_performance$Accuracy, gam_performance$Accuracy, nn_performance$Accuracy),
-        kappa = c(rf_performance$Kappa, gbm_performance$Kappa, gam_performance$Kappa, nn_performance$Kappa),
-        precision = c(rf_performance$Precision, gbm_performance$Precision, gam_performance$Precision, nn_performance$Precision),
-        recall = c(rf_performance$Recall, gbm_performance$Recall, gam_performance$Recall, nn_performance$Recall),
-        f1 = c(rf_performance$F1, gbm_performance$F1, gam_performance$F1, nn_performance$F1)
-      ) %>% 
-        tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value")
-      
-      return(performance_summary)
-    })
-    
-    # Combine and save evaluation results
-    performance_spec = bind_rows(performance_cv)
-    save(performance_spec, file = paste0("data/r_objects/ssdm_results/", species, ".RData"))
-  }
-)
-
-# ----------------------------------------------------------------------#
-# Combine results                                                    ####
-# ----------------------------------------------------------------------#
-files = list.files("data/r_objects/ssdm_results/", full.names = T)
-ssdm_results = lapply(files, function(f){load(f); return(performance_spec)}) %>% 
-  bind_rows() 
-
-save(ssdm_results, file = "data/r_objects/ssdm_results.RData")
diff --git a/R/04_02_modelling_msdm_embed.R b/R/04_02_modelling_msdm_embed.R
new file mode 100644
index 0000000..8257a3a
--- /dev/null
+++ b/R/04_02_modelling_msdm_embed.R
@@ -0,0 +1,112 @@
+library(dplyr)
+library(tidyr)
+library(cito)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data.RData")
+
+model_data = model_data %>% 
+  dplyr::filter(!is.na(fold_eval)) %>% 
+  dplyr::mutate(species = as.factor(species)) %>% 
+  sf::st_drop_geometry()
+
+# ----------------------------------------------------------------------#
+# Train model                                                        ####
+# ----------------------------------------------------------------------#
+predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
+formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species, dim = 30, train = T, lambda = 0.0001)"))
+
+# 1. Cross validation
+for(fold in 1:5){
+  # Prepare data
+  train_data = dplyr::filter(model_data, fold_eval != fold)
+  
+  # Run model
+  converged = F
+  while(!converged){
+    msdm_embed_fit = dnn(
+      formula,
+      data = train_data,
+      hidden = c(400L, 400L, 400L),
+      loss = "binomial",
+      activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+      epochs = 25000, 
+      lr = 0.01,   
+      baseloss = 1,
+      batchsize = nrow(train_data),
+      dropout = 0.3,
+      burnin = 500,
+      optimizer = config_optimizer("adam", weight_decay = 0.001),
+      lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
+      early_stopping = 300,
+      validation = 0.2,
+      device = "cuda"
+    )
+    
+    if(sum(!is.na(msdm_embed_fit$losses$valid_l)) > 5000){
+      converged = T
+    }
+  }
+  
+  save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold,".RData"))
+}
+
+# Full model
+msdm_embed_fit = dnn(
+  formula,
+  data = model_data,
+  hidden = c(400L, 400L, 400L),
+  loss = "binomial",
+  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
+  epochs = 25000, 
+  lr = 0.01,   
+  baseloss = 1,
+  batchsize = nrow(model_data),
+  dropout = 0.3,
+  burnin = 500,
+  optimizer = config_optimizer("adam", weight_decay = 0.001),
+  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
+  early_stopping = 300,
+  validation = 0.2,
+  device = "cuda"
+)
+
+save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData"))
+
+# ----------------------------------------------------------------------#
+# Evaluate model                                                     ####
+# ----------------------------------------------------------------------#
+msdm_embed_performance = lapply(1:5, function(fold){
+  load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData"))
+  
+  test_data_split = model_data %>% 
+    dplyr::filter(fold_eval == fold) %>% 
+    dplyr::group_split(species)
+  
+  lapply(test_data_split, function(test_data_spec){
+    species = test_data_spec$species[1]
+    
+    performance = tryCatch({
+      evaluate_model(msdm_embed_fit, test_data_spec)
+    }, error = function(e){
+      list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, 
+           Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, 
+           TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
+    })
+    
+    performance_summary = performance %>% 
+      as_tibble() %>% 
+      mutate(
+        species = !!species,
+        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
+        fold_eval = !!fold,
+        model = "MSDM_embed",
+      ) %>% 
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
+  }) %>% 
+    bind_rows()
+}) %>% 
+  bind_rows()
+
+save(msdm_embed_performance, file = paste0("data/r_objects/msdm_embed_performance.RData"))
diff --git a/R/04_02_msdm_multiclass.R b/R/04_02_msdm_multiclass.R
deleted file mode 100644
index b2af99b..0000000
--- a/R/04_02_msdm_multiclass.R
+++ /dev/null
@@ -1,68 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-train_data = dplyr::filter(model_data, present == 1, train == 1) # Use only presences for training
-test_data = dplyr::filter(model_data, train == 0)                 # Evaluate on presences + pseudo-absences for comparability with binary models
-
-predictors = paste0("layer_", 1:19)
-
-# ----------------------------------------------------------------------#
-# Train model                                                        ####
-# ----------------------------------------------------------------------#
-formula = as.formula(paste0("species ~ ", paste(predictors, collapse = '+')))
-msdm_fit_multiclass = dnn(
-  formula,
-  data = train_data,
-  hidden = c(200L, 200L, 200L),
-  loss = "softmax",
-  activation = c("leaky_relu", "leaky_relu", "leaky_relu"),
-  epochs = 5000L, 
-  lr = 0.01,
-  batchsize = nrow(train_data)/5,
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda"
-)
-
-save(msdm_fit_multiclass, file = "data/r_objects/msdm_fit_multiclass.RData")
-
-# ----------------------------------------------------------------------#
-# Evaluate model                                                     ####
-# ----------------------------------------------------------------------#
-data_split = test_data %>% 
-  group_by(species) %>% 
-  group_split()
-
-msdm_results_multiclass = lapply(data_split, function(data_spec){
-  msdm_performance = tryCatch({
-    evaluate_multiclass_model(msdm_fit_multiclass, data_spec, k = 10) # Top-k accuracy
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = data_spec$species[1],
-    obs = nrow(dplyr::filter(model_data, species == data_spec$species[1])),
-    model = "MSDM_multiclass",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_multiclass, file = "data/r_objects/msdm_results_multiclass.RData")
diff --git a/R/04_03_msdm_embed_raw.R b/R/04_03_msdm_embed_raw.R
deleted file mode 100644
index 0775329..0000000
--- a/R/04_03_msdm_embed_raw.R
+++ /dev/null
@@ -1,81 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data_pa_sampling.RData")
-
-model_data = model_data %>% 
-  dplyr::mutate(species_int = as.integer(as.factor(model_data$species))) %>% 
-  sf::st_drop_geometry()
-  
-fold = 1
-
-test_data = dplyr::filter(model_data, fold_eval == fold)
-train_data = dplyr::filter(model_data, fold_eval != fold)
-
-# ----------------------------------------------------------------------#
-# Train model                                                        ####
-# ----------------------------------------------------------------------#
-predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, dim = 50, train = T, lambda = 0.0001)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_raw = dnn(
-  formula,
-  data = train_data,
-  hidden = c(250L, 250L, 250L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 10L, 
-  lr = 0.1,   
-  baseloss = 1,
-  batchsize = 4096,
-  dropout = 0.1,
-  burnin = 500,
-  lambda = 0.0001,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 150, factor = 0.7),
-  early_stopping = 400,
-  validation = 0.2,
-  device = "cuda"
-)
-
-save(msdm_fit_embedding_raw, file = paste0("data/r_objects/msdm_raw_results/msdm_raw_fold", fold,".RData"))
-
-# ----------------------------------------------------------------------#
-# Evaluate model                                                     ####
-# ----------------------------------------------------------------------#
-load("data/r_objects/msdm_fit_embedding_raw.RData")
-
-data_split = test_data %>% 
-  split(test_data$species)
-
-msdm_results_embedding_raw = lapply(data_split, function(pa_spec){
-  species = pa_spec$species[1]
-  pa_spec = pa_spec %>% 
-    dplyr::select(-species, -fold_eval)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_raw, pa_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!species,
-    obs = nrow(dplyr::filter(train_data, species == !!species)),
-    fold_eval = !!fold,
-    model = "MSDM_embed_raw",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  ) %>% 
-    tidyr::pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value")
-}) %>% bind_rows()
-
-save(msdm_results_embedding_raw, file = paste0("data/r_objects/msdm_raw_results/result_msdm_raw_fold", fold,".RData"))
diff --git a/R/04_04_msdm_embed_traits.R b/R/04_04_msdm_embed_traits.R
deleted file mode 100644
index fce4d55..0000000
--- a/R/04_04_msdm_embed_traits.R
+++ /dev/null
@@ -1,143 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-load("data/r_objects/func_dist.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-model_species = intersect(model_data$species, names(func_dist)) 
-
-model_data_final = model_data %>%
-  dplyr::filter(species %in% !!model_species) %>% 
-  # dplyr::mutate_at(vars(starts_with("layer")), ~as.vector(scale(.))) %>%  # Scaling seems to make things worse often
-  dplyr::mutate(species_int = as.integer(as.factor(species)))
-
-train_data = dplyr::filter(model_data_final, train == 1)
-test_data = dplyr::filter(model_data_final, train == 0)
-
-sp_ind = match(model_species, names(func_dist))
-func_dist = as.matrix(func_dist)[sp_ind, sp_ind]
-
-embeddings = eigen(func_dist)$vectors[,1:20]
-predictors = paste0("layer_", 1:19)
-
-# ----------------------------------------------------------------------#
-# Without training the embedding                                     ####
-# ----------------------------------------------------------------------#
-# 1. Train
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_traits_static = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 15000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_traits_static, file = "data/r_objects/msdm_fit_embedding_traits_static.RData")
-
-# 2. Evaluate
-# Per species
-load("data/r_objects/msdm_fit_embedding_traits_static.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_traits_static = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_traits_static, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_traits_static",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_traits_static, file = "data/r_objects/msdm_results_embedding_traits_static.RData")
-
-# -------------------------------------------------------------------#
-# With training the embedding                                     ####
-# ------------------------------------------------------------ ------#
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_traits_trained = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 15000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_traits_trained, file = "data/r_objects/msdm_fit_embedding_traits_trained.RData")
-
-# 2. Evaluate
-load("data/r_objects/msdm_fit_embedding_traits_trained.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_traits_trained = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_traits_trained, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_traits_trained",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_traits_trained, file = "data/r_objects/msdm_results_embedding_traits_trained.RData")
diff --git a/R/04_05_msdm_embed_phylo.R b/R/04_05_msdm_embed_phylo.R
deleted file mode 100644
index 11134f3..0000000
--- a/R/04_05_msdm_embed_phylo.R
+++ /dev/null
@@ -1,143 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-load("data/r_objects/phylo_dist.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-model_species = intersect(model_data$species, colnames(phylo_dist)) 
-
-model_data_final = model_data %>%
-  dplyr::filter(species %in% !!model_species) %>% 
-  # dplyr::mutate_at(vars(starts_with("layer")), scale) %>%  # Scaling seems to make things worse often
-  dplyr::mutate(species_int = as.integer(as.factor(species)))
-
-train_data = dplyr::filter(model_data_final, train == 1)
-test_data = dplyr::filter(model_data_final, train == 0)
-
-sp_ind = match(model_species, colnames(phylo_dist))
-phylo_dist = phylo_dist[sp_ind, sp_ind]
-
-embeddings = eigen(phylo_dist)$vectors[,1:20]
-predictors = paste0("layer_", 1:19)
-
-# ----------------------------------------------------------------------#
-# Without training the embedding                                     ####
-# ----------------------------------------------------------------------#
-# 1. Train
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_phylo_static = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 15000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_phylo_static, file = "data/r_objects/msdm_fit_embedding_phylo_static.RData")
-
-# 2. Evaluate
-# Per species
-load("data/r_objects/msdm_fit_embedding_phylo_static.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_phylo_static = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_phylo_static, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_phylo_static",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_phylo_static, file = "data/r_objects/msdm_results_embedding_phylo_static.RData")
-
-# -------------------------------------------------------------------#
-# With training the embedding                                     ####
-# ------------------------------------------------------------ ------#
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_phylo_trained = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 15000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_phylo_trained, file = "data/r_objects/msdm_fit_embedding_phylo_trained.RData")
-
-# 2. Evaluate
-load("data/r_objects/msdm_fit_embedding_phylo_trained.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_phylo_trained = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_phylo_trained, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_phylo_trained",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_phylo_trained, file = "data/r_objects/msdm_results_embedding_phylo_trained.RData")
diff --git a/R/04_06_msdm_embed_ranges.R b/R/04_06_msdm_embed_ranges.R
deleted file mode 100644
index b9bd65d..0000000
--- a/R/04_06_msdm_embed_ranges.R
+++ /dev/null
@@ -1,144 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-load("data/r_objects/range_dist.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-model_species = intersect(model_data$species, colnames(range_dist)) 
-
-model_data_final = model_data %>%
-  dplyr::filter(species %in% !!model_species) %>% 
-  # dplyr::mutate_at(vars(starts_with("layer")), scale) %>%  # Scaling seems to make things worse often
-  dplyr::mutate(species_int = as.integer(as.factor(species)))
-
-train_data = dplyr::filter(model_data_final, train == 1)
-test_data = dplyr::filter(model_data_final, train == 0)
-
-sp_ind = match(model_species, colnames(range_dist))
-range_dist = range_dist[sp_ind, sp_ind]
-
-embeddings = eigen(range_dist)$vectors[,1:20]
-mode(embeddings) = "numeric"
-predictors = paste0("layer_", 1:19)
-
-# ----------------------------------------------------------------------#
-# Without training the embedding                                     ####
-# ----------------------------------------------------------------------#
-# 1. Train
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_range_static = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 15000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_range_static, file = "data/r_objects/msdm_fit_embedding_range_static.RData")
-
-# 2. Evaluate
-# Per species
-load("data/r_objects/msdm_fit_embedding_range_static.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_range_static = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_range_static, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_range_static",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_range_static, file = "data/r_objects/msdm_results_embedding_range_static.RData")
-
-# -------------------------------------------------------------------#
-# With training the embedding                                     ####
-# ------------------------------------------------------------ ------#
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_range_trained = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 15000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_range_trained, file = "data/r_objects/msdm_fit_embedding_range_trained.RData")
-
-# 2. Evaluate
-load("data/r_objects/msdm_fit_embedding_range_trained.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_range_trained = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_range_trained, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_range_trained",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_range_trained, file = "data/r_objects/msdm_results_embedding_range_trained.RData")
diff --git a/R/04_07_msdm_embed_multi_nolonlat.R b/R/04_07_msdm_embed_multi_nolonlat.R
deleted file mode 100644
index 15d37e9..0000000
--- a/R/04_07_msdm_embed_multi_nolonlat.R
+++ /dev/null
@@ -1,111 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-library(sf)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data_pa_sampling.RData")
-load("data/r_objects/func_dist.RData")
-load("data/r_objects/phylo_dist.RData")
-load("data/r_objects/range_dist.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-model_species = Reduce(
-  intersect, 
-  list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist))
-) 
-
-model_data_final = model_data %>%
-  sf::st_drop_geometry() %>% 
-  dplyr::filter(species %in% !!model_species) %>% 
-  dplyr::mutate(species_int = as.integer(as.factor(species)))
-  
-
-test_data = dplyr::filter(model_data_final, fold_eval == 1)
-train_data = dplyr::filter(model_data_final, fold_eval != 1)
-
-# Create embeddings
-func_ind = match(model_species, colnames(func_dist))
-func_dist = func_dist[func_ind, func_ind]
-func_embeddings = eigen(func_dist)$vectors[,1:20]
-
-phylo_ind = match(model_species, colnames(phylo_dist))
-phylo_dist = phylo_dist[phylo_ind, phylo_ind]
-phylo_embeddings = eigen(phylo_dist)$vectors[,1:20]
-
-range_ind = match(model_species, colnames(range_dist))
-range_dist = range_dist[range_ind, range_ind]
-range_embeddings = eigen(range_dist)$vectors[,1:20]
-
-
-# ----------------------------------------------------------------------#
-# Train model                                                        ####
-# ----------------------------------------------------------------------#
-# Define predictors
-predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
-
-formula = as.formula(
-  paste0("present ~ ", 
-         paste(predictors, collapse = '+'),  
-         " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = T)",
-         " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = T)",
-         " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = T)"
-  )
-)
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_multi_nolonlat = dnn(
-  formula,
-  data = train_data,
-  hidden = c(200L, 200L, 200L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 30000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_multi_nolonlat, file = "data/r_objects/msdm_fit_embedding_multi_nolonlat.RData")
-
-# ----------------------------------------------------------------------#
-# Evaluate results                                                   ####
-# ----------------------------------------------------------------------#
-load("data/r_objects/msdm_fit_embedding_multi_nolonlat.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_multi_nolonlat = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_multi_nolonlat, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_multi_nolonlat",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_multi_nolonlat, file = "data/r_objects/msdm_results_embedding_multi_nolonlat.RData")
\ No newline at end of file
diff --git a/R/04_07_msdm_oneoff.R b/R/04_07_msdm_oneoff.R
deleted file mode 100644
index 4ef2a03..0000000
--- a/R/04_07_msdm_oneoff.R
+++ /dev/null
@@ -1,111 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-library(sf)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data_pa_sampling.RData")
-load("data/r_objects/func_dist.RData")
-load("data/r_objects/phylo_dist.RData")
-load("data/r_objects/range_dist.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-model_species = Reduce(
-  intersect, 
-  list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist))
-) 
-
-model_data_final = model_data %>%
-  sf::st_drop_geometry() %>% 
-  dplyr::filter(species %in% !!model_species) %>% 
-  dplyr::mutate(species_int = as.integer(as.factor(species)))
-  
-
-test_data = dplyr::filter(model_data_final, fold_eval == 1)
-train_data = dplyr::filter(model_data_final, fold_eval != 1)
-
-# Create embeddings
-func_ind = match(model_species, colnames(func_dist))
-func_dist = func_dist[func_ind, func_ind]
-func_embeddings = eigen(func_dist)$vectors[,1:20]
-
-phylo_ind = match(model_species, colnames(phylo_dist))
-phylo_dist = phylo_dist[phylo_ind, phylo_ind]
-phylo_embeddings = eigen(phylo_dist)$vectors[,1:20]
-
-range_ind = match(model_species, colnames(range_dist))
-range_dist = range_dist[range_ind, range_ind]
-range_embeddings = eigen(range_dist)$vectors[,1:20]
-
-
-# ----------------------------------------------------------------------#
-# Train model                                                        ####
-# ----------------------------------------------------------------------#
-# Define predictors
-predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
-
-formula = as.formula(
-  paste0("present ~ ", 
-         paste(predictors, collapse = '+'),  
-         " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = T)",
-         " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = T)",
-         " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = T)"
-  )
-)
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_multi_nolonlat = dnn(
-  formula,
-  data = train_data,
-  hidden = c(400L, 400L, 400L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 30000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 150, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_multi_nolonlat, file = "data/r_objects/msdm_fit_embedding_multi_nolonlat.RData")
-
-# ----------------------------------------------------------------------#
-# Evaluate results                                                   ####
-# ----------------------------------------------------------------------#
-load("data/r_objects/msdm_fit_embedding_multi_nolonlat.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_multi_nolonlat = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_multi_nolonlat, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_multi_nolonlat",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_multi_nolonlat, file = "data/r_objects/msdm_results_embedding_multi_nolonlat.RData")
diff --git a/R/04_08_msdm_embed_multi_lonlat.R b/R/04_08_msdm_embed_multi_lonlat.R
deleted file mode 100644
index 2f2e03f..0000000
--- a/R/04_08_msdm_embed_multi_lonlat.R
+++ /dev/null
@@ -1,107 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-load("data/r_objects/func_dist.RData")
-load("data/r_objects/phylo_dist.RData")
-load("data/r_objects/range_dist.RData")
-
-# ----------------------------------------------------------------------#
-# Prepare data                                                       ####
-# ----------------------------------------------------------------------#
-model_species = Reduce(
-  intersect, 
-  list(unique(model_data$species), colnames(range_dist), colnames(phylo_dist), colnames(func_dist))
-) 
-
-model_data_final = model_data %>%
-  dplyr::filter(species %in% !!model_species) %>% 
-  dplyr::mutate(species_int = as.integer(as.factor(species)))
-
-train_data = dplyr::filter(model_data_final, train == 1)
-test_data = dplyr::filter(model_data_final, train == 0)
-
-# Create embeddings
-func_ind = match(model_species, colnames(func_dist))
-func_dist = func_dist[func_ind, func_ind]
-func_embeddings = eigen(func_dist)$vectors[,1:20]
-
-phylo_ind = match(model_species, colnames(phylo_dist))
-phylo_dist = phylo_dist[phylo_ind, phylo_ind]
-phylo_embeddings = eigen(phylo_dist)$vectors[,1:20]
-
-range_ind = match(model_species, colnames(range_dist))
-range_dist = range_dist[range_ind, range_ind]
-range_embeddings = eigen(range_dist)$vectors[,1:20]
-
-
-# ----------------------------------------------------------------------#
-# Train model                                                        ####
-# ----------------------------------------------------------------------#
-predictors = c("longitude", "latitude", paste0("layer_", 1:19))
-
-formula = as.formula(
-  paste0("present ~ ", 
-         paste(predictors, collapse = '+'),  
-         " + e(species_int, weights = func_embeddings, lambda = 0.00001, train = F)",
-         " + e(species_int, weights = phylo_embeddings, lambda = 0.00001, train = F)",
-         " + e(species_int, weights = range_embeddings, lambda = 0.00001, train = F)"
-  )
-)
-
-plot(1, type="n", xlab="", ylab="", xlim=c(0, 25000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
-msdm_fit_embedding_multi_lonlat = dnn(
-  formula,
-  data = train_data,
-  hidden = c(500L, 500L, 500L),
-  loss = "binomial",
-  activation = c("sigmoid", "leaky_relu", "leaky_relu"),
-  epochs = 30000L, 
-  lr = 0.01,   
-  baseloss = 1,
-  batchsize = nrow(train_data),
-  dropout = 0.1,
-  burnin = 100,
-  optimizer = config_optimizer("adam", weight_decay = 0.001),
-  lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
-  early_stopping = 250,
-  validation = 0.3,
-  device = "cuda",
-)
-save(msdm_fit_embedding_multi_lonlat, file = "data/r_objects/msdm_fit_embedding_multi_lonlat.RData")
-
-# ----------------------------------------------------------------------#
-# Evaluate results                                                   ####
-# ----------------------------------------------------------------------#
-load("data/r_objects/msdm_fit_embedding_multi_lonlat.RData")
-data_split = test_data %>% 
-  group_by(species_int) %>% 
-  group_split()
-
-msdm_results_embedding_multi_lonlat = lapply(data_split, function(data_spec){
-  target_species =  data_spec$species[1]
-  data_spec = dplyr::select(data_spec, -species)
-  
-  msdm_performance = tryCatch({
-    evaluate_model(msdm_fit_embedding_multi_lonlat, data_spec)
-  }, error = function(e){
-    list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
-  })
-  
-  performance_summary = tibble(
-    species = !!target_species,
-    obs = length(which(model_data$species == target_species)),
-    model = "MSDM_embed_informed_multi_lonlat",
-    auc = msdm_performance$AUC,
-    accuracy = msdm_performance$Accuracy,
-    kappa = msdm_performance$Kappa,
-    precision = msdm_performance$Precision,
-    recall = msdm_performance$Recall,
-    f1 = msdm_performance$F1
-  )
-}) %>% bind_rows()
-
-save(msdm_results_embedding_multi_lonlat, file = "data/r_objects/msdm_results_embedding_multi_lonlat.RData")
\ No newline at end of file
diff --git a/R/05_01_performance_analysis.qmd b/R/05_01_performance_analysis.qmd
deleted file mode 100644
index 17be731..0000000
--- a/R/05_01_performance_analysis.qmd
+++ /dev/null
@@ -1,627 +0,0 @@
----
-title: "SDM Performance analysis"
-format: html
-editor: visual
-engine: knitr
----
-
-```{r init, echo = FALSE, include = FALSE}
-library(tidyverse)
-library(sf)
-library(plotly)
-library(DT)
-
-load("../data/r_objects/model_data_pa_sampling.RData")
-load("../data/r_objects/ssdm_results.RData")
-load("../data/r_objects/msdm_results_embedding_raw.RData")
-
-load("../data/r_objects/range_maps.RData")
-load("../data/r_objects/range_maps_gridded.RData")
-load("../data/r_objects/occs_final.RData")
-load("../data/r_objects/functional_groups.RData")
-sf::sf_use_s2(use_s2 = FALSE)
-```
-
-```{r globals, echo = FALSE, include = FALSE}
-
-
-# Count occs per species
-obs_count = model_data %>% 
-  sf::st_drop_geometry() %>% 
-  dplyr::filter(present == 1) %>% 
-  dplyr::group_by(species) %>% 
-  dplyr::summarise(obs = n())
-
-
-# Regression functions
-asym_regression = function(x, y){
-  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
-  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
-  data.frame(
-    x = new_x,
-    fit = predict(nls_fit, newdata = data.frame(x = new_x))
-  )
-}
-
-lin_regression = function(x, y, family = "binomial"){
-  glm_fit = suppressWarnings(glm(y~x, family = family))
-  new_x = seq(min(x), max(x), length.out = 100)
-  data.frame(
-    x = new_x,
-    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
-  )
-}
-
-msdm_results = msdm_results_embedding_raw %>% 
-  pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") %>% 
-  dplyr::select(-obs) %>% 
-  dplyr::mutate(
-    fold_eval = 1
-  ) %>% 
-  drop_na()
-
-# Performance table
-performance = ssdm_results %>% 
-  dplyr::select(-obs) %>% 
-  dplyr::filter(fold_eval == 1, species %in% msdm_results$species) %>%  # Only look at first fold
-  bind_rows(msdm_results) %>% 
-  ungroup() %>% 
-  dplyr::mutate(
-    value = case_when(
-      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accurracy", "precision", "recall")) ~ 0.5,
-      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
-      .default = value
-    )
-  )
-
-focal_metrics = unique(performance$metric)
-```
-
-## Summary
-
-This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.
-
-Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).
-
-### Modeling overview:
-
-#### General decisions
-
--   Randomly sampled pseudo-absences from expanded area of extent of occurrence records (×1.25)
--   Balanced presences and absences for each species
--   Predictors: all 19 CHELSA bioclim variables
--   70/30 Split of training vs. test data (except for NN models)
-
-#### sSDM Algorithms
-
-Random Forest (**SSDM_RF**)
-
--   Hyperparameter tuning of `mtry`
--   Spatial block cross-validation during training
-
-Generalized boosted machine (**SSDM_GBM**)
-
--   Hyperparameter tuning across `n.trees` , `interaction.depth` , `shrinkage`, `n.minobsinnode`
--   Spatial block cross-validation during training
-
-Generalized Linear Model (**SSDM_GLM**)
-
--   Logistic model with binomial link function
--   Spatial block cross-validation during training
-
-Neural Netwok (**SSDM_NN**)
-
--   Three hidden layers, leaky ReLu activations, binomial loss
--   no spatial block cross-validation during training
-
-#### mSDM Algorithms
-
-Binary Neural Network with species embedding (**MSDM_embed**)
-
--   definition: presence \~ environment + embedding(species)
--   prediction: probability of occurrence given a set of (environmental) inputs and species identity
--   embedding initialized at random
--   three hidden layers, sigmoid + leaky ReLu activations, binomial loss
-
-Binary Neural Network with trait-informed species embedding (**MSDM_embed_informed_trained**)
-
--   definition: presence \~ environment + embedding(species)
--   prediction: probability of occurrence given a set of (environmental) inputs and species identity
--   embedding initialized using eigenvectors of functional distance matrix, then further training on data
--   three hidden layers, sigmoid + leaky ReLu activations, binomial loss
-
-Multi-Class Neural Network (**MSDM_multiclass**)
-
--   definition: species identity \~ environment
--   prediction: probability distribution across all observed species given a set of (environmental) inputs
--   presence-only data in training
--   three hidden layers, leaky ReLu activations, softmax loss
--   Top-k based evaluation (k=10, P/A \~ target species in / not among top 10 predictions)
-
-### Key findings:
-
--   sSDM algorithms (RF, GBM) outperformed mSDMs in most cases
--   mSDMs showed indications of better performance for rare species (\< 10-20 occurrences)
--   More occurrence records and larger range sizes tended to improve model performance
--   Higher range coverage correlated with better performance
--   Range coverage bias and functional group showed some impact but were less consistent
--   Convergence problems hampered NN sSDM performance
-
-## Analysis
-
-The table below shows the analysed modeling results.
-
-```{r performance, echo = FALSE, message=FALSE, warnings=FALSE}
-DT::datatable(performance) %>% 
-  formatRound(columns="value", digits=3)
-```
-
-### Number of records
-
--   Model performance was generally better for species with more observations
--   Very poor performance below 50-100 observations
-
-```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE}
-plot_performance_over_frequency = function(df_plot, metric) {
-
-  df_plot = dplyr::filter(df_plot, metric == !!metric) 
-  
-  # Calculate regression lines for each model and metric combination
-  suppressWarnings({
-    regression_lines = df_plot %>%
-      group_by(model) %>%
-      group_modify( ~ asym_regression(.x$obs, .x$value))
-  })
-  
-  # Create base plot
-  plot <- plot_ly() %>%
-    layout(
-      title = "Model Performance vs. Number of observations",
-      xaxis = list(title = "Number of observations", type = "log"),
-      yaxis = list(title = metric),
-      legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-      margin = list(r = 150), # Add right margin to accommodate legend
-      hovermode = 'closest'
-    )
-  
-  # Points
-  for (model_name in unique(df_plot$model)) {
-    plot = plot %>%
-      add_markers(
-        data = filter(df_plot, model == model_name, metric %in% focal_metrics),
-        x = ~ obs,
-        y = ~ value,
-        color = model_name, # Set color to match legendgroup
-        legendgroup = model_name,
-        opacity = 0.6,
-        name = ~ model,
-        hoverinfo = 'text',
-        text = ~ paste(
-          "Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)
-        )
-      )
-  }
-  
-  # Add regression lines
-  for (model_name in unique(df_plot$model)) {
-    reg_data = dplyr::filter(regression_lines, model == model_name)
-    plot = plot %>%
-      add_lines(
-        data = reg_data,
-        x = ~ x,
-        y = ~ fit,
-        color = model_name, # Set color to match legendgroup
-        legendgroup = model_name,
-        name = paste(model_name, '(fit)'),
-        showlegend = FALSE
-      )
-  }
-  
-  return(plot)
-}
-
-df_plot = performance %>% dplyr::left_join(obs_count, by = "species") 
-  
-```
-
-::: panel-tabset
-#### AUC
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "auc")
-bslib::card(plot, full_screen = T)
-```
-
-#### F1
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "f1")
-bslib::card(plot, full_screen = T)
-```
-
-#### Cohen's kappa
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "kappa")
-bslib::card(plot, full_screen = T)
-```
-
-#### Accurracy
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "accuracy")
-bslib::card(plot, full_screen = T)
-```
-
-#### Precision
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "precision")
-bslib::card(plot, full_screen = T)
-```
-
-#### Recall
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "recall")
-bslib::card(plot, full_screen = T)
-```
-:::
-
-### Range characteristics
-
-#### Range size
-
-Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).
-
--   Model performance tended to be slightly higher for species with larger range size
--   Only RF shows continuous performance improvements beyond range sizes of \~5M km²
-
-```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE, eval=F}
-
-df_join = range_maps %>% 
-  dplyr::mutate(range_size = as.numeric(st_area(range_maps) / 1000000)) %>%  # range in sqkm
-  sf::st_drop_geometry()
-
-df_plot = performance %>% 
-  inner_join(df_join, by = c("species" = "name_matched"))
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~asym_regression(.x$range_size, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>% 
-  layout(
-    title = "Model Performance vs. Range size",
-    xaxis = list(title = "Range size"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Add regression lines and confidence intervals for each model
-for (model_name in unique(df_plot$model)) {
-  plot <- plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~range_size,
-      y = ~value,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Range size:", range_size, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-for (model_name in unique(df_plot$model)) {
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>% 
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-#### Range coverage
-
-Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.
-
-$$
-RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
-$$
-
--   Models for species with higher range coverage showed slightly better performance
-
-```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE, eval=F}
-df_cells_total = range_maps_gridded %>%
-  dplyr::rename("species" = name_matched) %>% 
-  group_by(species) %>%
-  summarise(cells_total = n()) %>% 
-  st_drop_geometry()
-
-df_cells_occ <- range_maps_gridded %>%
-  st_join(occs_final, join = st_intersects) %>%
-  filter(name_matched == species) %>%         # Filter only intersections of the same species
-  group_by(species) %>%
-  summarise(cells_occupied = n_distinct(geometry)) %>%
-  st_drop_geometry()
-
-df_join = df_cells_total %>% 
-  dplyr::inner_join(df_cells_occ, by = "species") %>% 
-  dplyr::mutate(range_cov = cells_occupied / cells_total) %>% 
-  dplyr::select(species, range_cov)
-
-df_plot = performance %>% 
-  inner_join(df_join, by = "species")
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~lin_regression(.x$range_cov, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>% 
-  layout(
-    title = "Model Performance vs. Range coverage",
-    xaxis = list(title = "Range coverage"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Add regression lines and confidence intervals for each model
-for (model_name in unique(df_plot$model)) {
-  plot <- plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~range_cov,
-      y = ~value,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Range coverage:", range_cov, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-for (model_name in unique(df_plot$model)) {
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>% 
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-#### Range coverage bias
-
-Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
-
-$$
-RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)}
-$$
-
-Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
-
--   There was no strong relationship between range coverage bias and model performance
-
-```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE, eval=F}
-df_occs_total = occs_final %>% 
-  st_drop_geometry() %>% 
-  group_by(species) %>% 
-  summarise(occs_total = n())
-
-df_join = df_occs_total %>% 
-  dplyr::inner_join(df_cells_total, by = "species") %>% 
-  dplyr::inner_join(df_cells_occ, by = "species") %>% 
-  dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1)))
-
-df_plot = performance %>% 
-  inner_join(df_join, by = "species")
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~lin_regression(.x$range_bias, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>% 
-  layout(
-    title = "Model Performance vs. Range coverage bias",
-    xaxis = list(title = "Range coverage bias"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Add regression lines and confidence intervals for each model
-for (model_name in unique(df_plot$model)) {
-  plot <- plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~range_bias,
-      y = ~value,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-for (model_name in unique(df_plot$model)) {
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>% 
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-
-bslib::card(plot, full_screen = T)
-```
-
-### Functional group
-
-Functional groups were assigned based on taxonomic order. The following groupings were used:
-
-| Functional group      | Taxomic orders                                                        |
-|--------------------|----------------------------------------------------|
-| large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla                    |
-| small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
-| arboreal              | Primates, Pilosa                                                      |
-| flying                | Chiroptera                                                            |
-
--   Models for bats tended to perform slightly worse than for other groups.
-
-```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE, eval=F}
-df_plot = performance %>% 
-  dplyr::left_join(functional_groups, by = c("species" = "name_matched"))
-
-plot <- plot_ly(
-  data = df_plot,
-  x = ~functional_group,
-  y = ~value,
-  color = ~model,
-  type = 'box',
-  boxpoints = "all",
-  jitter = 1,
-  pointpos = 0,
-  hoverinfo = 'text',
-  text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)),
-  transforms = list(
-    list(
-      type = 'filter',
-      target = ~metric,
-      operation = '=',
-      value = focal_metrics[1]  # default value
-    )
-  )
-)
-
-plot <- plot %>%
-  layout(
-    title = "Model Performance vs. Functional Group",
-    xaxis = list(title = "Functional group"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    boxmode = "group",
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-bslib::card(plot, full_screen = T)
-```
diff --git a/R/05_01_performance_analysis_carsten.qmd b/R/05_01_performance_analysis_carsten.qmd
deleted file mode 100644
index 7dee2c4..0000000
--- a/R/05_01_performance_analysis_carsten.qmd
+++ /dev/null
@@ -1,190 +0,0 @@
----
-title: "SDM Performance analysis"
-format: html
-editor: visual
-engine: knitr
----
-
-```{r init, echo = FALSE, include = FALSE}
-library(tidyverse)
-library(sf)
-library(plotly)
-library(DT)
-
-load("../data/r_objects/model_data_pa_sampling.RData")
-load("../data/r_objects/ssdm_results.RData")
-load("../data/r_objects/msdm_results_embedding_raw.RData")
-
-load("../data/r_objects/range_maps.RData")
-load("../data/r_objects/range_maps_gridded.RData")
-load("../data/r_objects/occs_final.RData")
-load("../data/r_objects/functional_groups.RData")
-sf::sf_use_s2(use_s2 = FALSE)
-```
-
-```{r globals, echo = FALSE, include = FALSE}
-
-
-# Count occs per species
-obs_count = model_data %>% 
-  sf::st_drop_geometry() %>% 
-  dplyr::filter(present == 1) %>% 
-  dplyr::group_by(species) %>% 
-  dplyr::summarise(obs = n())
-
-
-# Regression functions
-asym_regression = function(x, y){
-  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
-  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
-  data.frame(
-    x = new_x,
-    fit = predict(nls_fit, newdata = data.frame(x = new_x))
-  )
-}
-
-lin_regression = function(x, y, family = "binomial"){
-  glm_fit = suppressWarnings(glm(y~x, family = family))
-  new_x = seq(min(x), max(x), length.out = 100)
-  data.frame(
-    x = new_x,
-    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
-  )
-}
-
-msdm_results = msdm_results_embedding_raw %>% 
-  pivot_longer(all_of(c("auc", "accuracy", "kappa", "precision", "recall", "f1")), names_to = "metric", values_to = "value") %>% 
-  dplyr::select(-obs) %>% 
-  dplyr::mutate(
-    fold_eval = 1
-  ) %>% 
-  drop_na()
-
-# Performance table
-performance = ssdm_results %>% 
-  dplyr::select(-obs) %>% 
-  dplyr::filter(fold_eval == 1, species %in% msdm_results$species) %>%  # Only look at first fold
-  bind_rows(msdm_results) %>% 
-  ungroup() %>% 
-  dplyr::mutate(
-    value = case_when(
-      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accurracy", "precision", "recall")) ~ 0.5,
-      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
-      .default = value
-    )
-  )
-
-focal_metrics = unique(performance$metric)
-```
-
-
-## Analysis
-
-### Number of records
-
-```{r number_of_records, echo = FALSE, message=FALSE, warnings=FALSE}
-plot_performance_over_frequency = function(df_plot, metric) {
-
-  df_plot = dplyr::filter(df_plot, metric == !!metric) 
-  
-  # Calculate regression lines for each model and metric combination
-  suppressWarnings({
-    regression_lines = df_plot %>%
-      group_by(model) %>%
-      group_modify( ~ asym_regression(.x$obs, .x$value))
-  })
-  
-  # Create base plot
-  plot <- plot_ly() %>%
-    layout(
-      title = "Model Performance vs. Number of observations",
-      xaxis = list(title = "Number of observations", type = "log"),
-      yaxis = list(title = metric),
-      legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
-      margin = list(r = 150), # Add right margin to accommodate legend
-      hovermode = 'closest'
-    )
-  
-  # Points
-  for (model_name in unique(df_plot$model)) {
-    plot = plot %>%
-      add_markers(
-        data = filter(df_plot, model == model_name, metric %in% focal_metrics),
-        x = ~ obs,
-        y = ~ value,
-        color = model_name, # Set color to match legendgroup
-        legendgroup = model_name,
-        opacity = 0.6,
-        name = ~ model,
-        hoverinfo = 'text',
-        text = ~ paste(
-          "Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)
-        )
-      )
-  }
-  
-  # Add regression lines
-  for (model_name in unique(df_plot$model)) {
-    reg_data = dplyr::filter(regression_lines, model == model_name)
-    plot = plot %>%
-      add_lines(
-        data = reg_data,
-        x = ~ x,
-        y = ~ fit,
-        color = model_name, # Set color to match legendgroup
-        legendgroup = model_name,
-        name = paste(model_name, '(fit)'),
-        showlegend = FALSE
-      )
-  }
-  
-  return(plot)
-}
-
-df_plot = performance %>% dplyr::left_join(obs_count, by = "species") 
-  
-```
-
-::: panel-tabset
-#### AUC
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "auc")
-bslib::card(plot, full_screen = T)
-```
-
-#### F1
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "f1")
-bslib::card(plot, full_screen = T)
-```
-
-#### Cohen's kappa
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "kappa")
-bslib::card(plot, full_screen = T)
-```
-
-#### Accurracy
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "accuracy")
-bslib::card(plot, full_screen = T)
-```
-
-#### Precision
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "precision")
-bslib::card(plot, full_screen = T)
-```
-
-#### Recall
-
-```{r echo = FALSE}
-plot = plot_performance_over_frequency(df_plot, metric = "recall")
-bslib::card(plot, full_screen = T)
-```
-:::
diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd
new file mode 100644
index 0000000..03ed7d3
--- /dev/null
+++ b/R/05_01_performance_report.qmd
@@ -0,0 +1,375 @@
+---
+title: "SDM Performance analysis"
+format: html
+editor: visual
+engine: knitr
+---
+
+```{r init, echo = FALSE, include = FALSE}
+library(tidyverse)
+library(sf)
+library(plotly)
+library(DT)
+
+load("../data/r_objects/model_data.RData")
+load("../data/r_objects/ssdm_results.RData")
+load("../data/r_objects/msdm_embed_performance.RData")
+
+load("../data/r_objects/range_maps.RData")
+load("../data/r_objects/range_maps_gridded.RData")
+load("../data/r_objects/occs_final.RData")
+load("../data/r_objects/functional_groups.RData")
+
+sf::sf_use_s2(use_s2 = FALSE)
+```
+
+```{r globals, echo = FALSE, include = FALSE}
+# Count occs per species
+obs_count = model_data %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::filter(present == 1) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::summarise(obs = n())
+
+
+# Regression functions
+asym_regression = function(x, y){
+  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
+  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
+  data.frame(
+    x = new_x,
+    fit = predict(nls_fit, newdata = data.frame(x = new_x))
+  )
+}
+
+lin_regression = function(x, y, family = "binomial"){
+  glm_fit = suppressWarnings(glm(y~x, family = family))
+  new_x = seq(min(x), max(x), length.out = 100)
+  data.frame(
+    x = new_x,
+    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
+  )
+}
+
+# Performance table
+performance = ssdm_results %>% 
+  bind_rows(msdm_embed_performance) %>% 
+  dplyr::group_by(species, model, metric) %>% 
+  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
+  dplyr::mutate(
+    metric = stringr::str_to_lower(metric),
+    value = case_when(
+      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
+      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
+      .default = value
+    )
+  )
+
+
+focal_metrics = unique(performance$metric)
+
+plot_performance = function(df_plot, metric, x_var, x_label, x_log = T, reg_func = lin_regression) {
+  df_plot = dplyr::filter(df_plot, metric == !!metric) %>% 
+    dplyr::rename(x_var = !!x_var)
+  
+  # Calculate regression lines for each model and metric combination
+  suppressWarnings({
+    regression_lines = df_plot %>%
+      group_by(model) %>%
+      group_modify( ~ reg_func(.x$x_var, .x$value))
+  })
+  
+  # Create base plot
+  plot <- plot_ly() %>%
+    layout(
+      title = paste0("Model Performance vs. ", x_label),
+      xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")),
+      yaxis = list(title = metric),
+      legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
+      margin = list(r = 150), # Add right margin to accommodate legend
+      hovermode = 'closest'
+    )
+  
+  # Points
+  for (model_name in unique(df_plot$model)) {
+    plot = plot %>%
+      add_markers(
+        data = filter(df_plot, model == model_name, metric %in% focal_metrics),
+        x = ~ x_var,
+        y = ~ value,
+        color = model_name, # Set color to match legendgroup
+        legendgroup = model_name,
+        opacity = 0.6,
+        name = ~ model,
+        hoverinfo = 'text',
+        text = ~ paste(
+          "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(value, 3)
+        )
+      )
+  }
+  
+  # Add regression lines
+  for (model_name in unique(df_plot$model)) {
+    reg_data = dplyr::filter(regression_lines, model == model_name)
+    plot = plot %>%
+      add_lines(
+        data = reg_data,
+        x = ~ x,
+        y = ~ fit,
+        color = model_name, # Set color to match legendgroup
+        legendgroup = model_name,
+        name = paste(model_name, '(fit)'),
+        showlegend = FALSE
+      )
+  }
+  
+  return(plot)
+}
+```
+
+## Summary
+
+This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.
+
+Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).
+
+### Modeling overview:
+
+#### General decisions
+
+-   Balanced presences and absences for each species
+-   Absence sampling from probability surface that balances geographical sampling bias and environmental preferences per species
+    -   higher probability in areas that have been sampled more intensely
+    -   lower probability in areas with environmental conditions similar to presence locations
+
+![absence sampling sampling](images/Akodon%20boliviensis.pdf){width="100%" height="800"}
+
+-   Eight predictors:
+    -   Min Temperature of Coldest Month (bio6)
+    -   Precipitation of Driest Quarter (bio17)
+    -   Climate Moisture Index (cmi)
+    -   Surface Downwelling Shortwave Flux (rsds)
+    -   Tree canopy cover (igfc)
+    -   Distance to freshwater (dtfw)
+    -   Seasonal inundation(igsw)
+    -   Topographic roughness (roughness)
+-   Nested cross validation
+    -   Outer (performance evaluation): Spatially blocked 5-fold cross validation for all models
+    -   Inner (model training): Random split (1/5) in cito, Spatially blocked 5-fold cross validation in other models
+
+#### sSDM Algorithms
+
+-   Four Algorithms: Random Forest (RF), Gradient Boosting Machine (GBM), Generalized Additive Model (GAM), Neural Network (NN)
+
+-   NN: Manual hyperparameter tuning, same settings across species
+
+-   RF + GBM + GAM: Automated hyperparameter tuning (4 random combinations) per species
+
+#### mSDM Algorithms
+
+Multispecies Neural Network with species embedding (**MSDM_embed**)
+
+-   prediction: probability of occurrence given a set of (environmental) inputs and species identity
+-   embedding initialized at random
+-   three hidden layers, sigmoid + leaky ReLu activations, binomial loss
+
+### Key findings:
+
+TBA
+
+## Analysis
+
+The table below shows the analysed modeling results.
+
+```{r performance, echo = FALSE, message=FALSE, warnings=FALSE}
+DT::datatable(performance) %>% 
+  formatRound(columns="value", digits=3)
+```
+
+### Number of records
+
+-   Model performance was generally better for species with more observations
+-   Very poor performance below 50-100 observations
+
+```{r, echo = FALSE, message=FALSE, warnings=FALSE}
+df_plot = performance %>% 
+  dplyr::left_join(obs_count, by = "species") 
+```
+
+::: panel-tabset
+#### AUC
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "auc", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### F1
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "f1", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Cohen's kappa
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "kappa", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Accuracy
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "accuracy", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Precision
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "precision", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Recall
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "recall", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+:::
+
+### Range size
+
+Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).
+
+```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE}
+df_join = range_maps %>% 
+  dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>%  # range in sqkm
+  sf::st_drop_geometry()
+
+df_plot = performance %>% 
+  inner_join(df_join, by = c("species" = "name_matched"))
+```
+
+::: panel-tabset
+#### AUC
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "auc", x_var = "range_size", x_label = "Range size")
+bslib::card(plot, full_screen = T)
+```
+
+#### F1
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "f1", x_var = "range_size", x_label = "Range size")
+bslib::card(plot, full_screen = T)
+```
+
+#### Cohen's kappa
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "kappa", x_var = "range_size", x_label = "Range size", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Accuracy
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_size", x_label = "Range size")
+bslib::card(plot, full_screen = T)
+```
+
+#### Precision
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "precision", x_var = "range_size", x_label = "Range size")
+bslib::card(plot, full_screen = T)
+```
+
+#### Recall
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "recall", x_var = "range_size", x_label = "Range size")
+bslib::card(plot, full_screen = T)
+```
+:::
+
+### Range coverage
+
+Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.
+
+$$
+RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
+$$
+
+```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE}
+occs_final_unproj = occs_final %>% sf::st_transform("EPSG:4326")
+
+df_cells_total = range_maps_gridded %>%
+  dplyr::rename("species" = name_matched) %>% 
+  group_by(species) %>%
+  summarise(cells_total = n()) %>% 
+  st_drop_geometry()
+
+df_cells_occ <- range_maps_gridded %>%
+  st_join(occs_final_unproj, join = st_intersects) %>%
+  filter(name_matched == species) %>%         # Filter only intersections of the same species
+  group_by(species) %>%
+  summarise(cells_occupied = n_distinct(geometry)) %>%
+  st_drop_geometry()
+
+df_join = df_cells_total %>% 
+  dplyr::inner_join(df_cells_occ, by = "species") %>% 
+  dplyr::mutate(range_cov = cells_occupied / cells_total) %>% 
+  dplyr::select(species, range_cov)
+
+df_plot = performance %>% 
+  inner_join(df_join, by = "species")
+```
+
+::: panel-tabset
+#### AUC
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "auc", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### F1
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "f1", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Cohen's kappa
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "kappa", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Accuracy
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Precision
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "precision", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+
+#### Recall
+
+```{r echo = FALSE}
+plot = plot_performance(df_plot, metric = "recall", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
+bslib::card(plot, full_screen = T)
+```
+:::
diff --git a/R/05_02_MSDM_comparison.qmd b/R/05_02_MSDM_comparison.qmd
deleted file mode 100644
index b70af79..0000000
--- a/R/05_02_MSDM_comparison.qmd
+++ /dev/null
@@ -1,779 +0,0 @@
----
-title: "mSDM comparison"
-format: html
-editor: visual
-engine: knitr
----
-
-```{r init, echo = FALSE, include = FALSE}
-library(tidyverse)
-library(sf)
-library(plotly)
-library(DT)
-library(shiny)
-
-load("../data/r_objects/simple_cv/msdm_results_embedding_raw.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_traits_static.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_traits_trained.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_phylo_static.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_phylo_trained.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_range_static.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_range_trained.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_multi_nolonlat.RData")
-load("../data/r_objects/simple_cv/msdm_results_embedding_multi_lonlat.RData")
-
-sf::sf_use_s2(use_s2 = FALSE)
-```
-
-```{r globals, echo = FALSE, include = FALSE}
-# Select metrics
-focal_metrics = c("auc", "f1", "accuracy")  # There's a weird bug in plotly that scrambles up lines when using more than three groups
-
-# Prepare final dataframes
-results_embedding_raw = msdm_results_embedding_raw %>% 
-  dplyr::mutate(
-    embedding = "raw",
-    training = "trained"
-  )
-
-results_embedding_informed = c(
-  "msdm_results_embedding_phylo_static",
-  "msdm_results_embedding_phylo_trained",
-  "msdm_results_embedding_range_static",
-  "msdm_results_embedding_range_trained",
-  "msdm_results_embedding_traits_static",
-  "msdm_results_embedding_traits_trained",
-  "msdm_results_embedding_multi_nolonlat",
-  "msdm_results_embedding_multi_lonlat"
-)
-
-results_embedding_informed_merged = lapply(results_embedding_informed, function(df_name){
-  df = get(df_name)
-  name_split = str_split_1(df_name, pattern = "_")
-  df$embedding = name_split[4]
-  df$training = name_split[5]
-  assign(df_name, df)
-}) %>% bind_rows()
-
-results_final = results_embedding_raw %>% 
-  bind_rows(results_embedding_informed_merged) %>% 
-  drop_na() %>% 
-  mutate(
-    model = recode(
-      model,
-      "MSDM_embed" = "M_STD",
-      "MSDM_embed_informed_phylo_static" = "M_PS",
-      "MSDM_embed_informed_phylo_trained" = "M_PT",
-      "MSDM_embed_informed_traits_static" = "M_TS",
-      "MSDM_embed_informed_traits_trained" = "M_TT",
-      "MSDM_embed_informed_range_static" = "M_RS",
-      "MSDM_embed_informed_range_trained" = "M_RT",
-      "MSDM_embed_informed_multi_nolonlat" = "M_MT_nolonlat",
-      "MSDM_embed_informed_multi_lonlat" = "M_MT_lonlat"
-    ),
-    across(all_of(focal_metrics), round, 3)
-  )
-
-results_final_long = results_final %>% 
-  tidyr::pivot_longer(focal_metrics, names_to = "metric", values_to = "value")
-
-delta_performance = results_final %>% 
-  dplyr::select(all_of(c("species", "model", focal_metrics))) %>% 
-  tidyr::pivot_wider(id_cols = species, names_from = model, values_from = c(auc, accuracy, f1)) %>% 
-  dplyr::mutate(
-    across(starts_with("auc_M_"), ~ . - auc_M_STD),
-    across(starts_with("accuracy_M_"), ~ . - accuracy_M_STD),
-    across(starts_with("f1_M_"), ~ . - f1_M_STD)
-  ) %>% 
-  dplyr::select(-auc_M_STD, -accuracy_M_STD, -f1_M_STD) %>% 
-  pivot_longer(-species, names_to = c("metric", "model"), names_pattern = "(^[a-zA-Z1-9]+)_(.+$)", values_to = "delta")
-
-# Regression functions
-asym_regression = function(x, y){
-  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
-  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
-  data.frame(
-    x = new_x,
-    fit = predict(nls_fit, newdata = data.frame(x = new_x))
-  )
-}
-
-lin_regression = function(x, y){
-  glm_fit = suppressWarnings(glm(y~x, family = "binomial"))
-  new_x = seq(min(x), max(x), length.out = 100)
-  data.frame(
-    x = new_x,
-    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
-  )
-}
-```
-
-```{r create_summaries, echo = FALSE}
-auc_overview = results_final %>% 
-  pivot_wider(names_from = model, values_from = auc, id_cols = c(species, obs)) %>% 
-  dplyr::arrange(obs)
-
-accuracy_overview = results_final %>% 
-  pivot_wider(names_from = model, values_from = accuracy, id_cols = c(species, obs)) %>% 
-  dplyr::arrange(obs)
-
-f1_overview = results_final %>% 
-  pivot_wider(names_from = model, values_from = f1, id_cols = c(species, obs)) %>% 
-  dplyr::arrange(obs)
-```
-
-## *Model overview*
-
-### *Performance summary*
-
-```{r echo = FALSE, message=FALSE, warnings=FALSE}
-p_summary = results_final_long %>% 
-  group_by(model, metric) %>% 
-  dplyr::summarize(value = round(mean(value, na.rm = T), 3)) %>% 
-  pivot_wider(names_from = model, values_from = value) %>% 
-  select(metric, M_STD, M_TS, M_PS, M_RS, M_TT, M_PT, M_RT)
-
-datatable(p_summary)
-```
-
-Exact results for different performance metrics across all models are shown below.
-
-::: panel-tabset
-### *AUC*
-
-```{r echo = FALSE}
-datatable(
-  auc_overview, 
-  options = list(
-    pageLength = 10,
-    initComplete = htmlwidgets::JS(
-      "function(settings, json) {
-         $(this.api().table().container()).css({'font-size': '10pt'});
-       }"
-    ),
-    autoWidth = TRUE,
-    columnDefs = list(list(width = "250px", targets = 1))
-  )
-)
-```
-
-### *Accuracy*
-
-```{r echo = FALSE}
-datatable(
-  accuracy_overview, 
-  options = list(
-    pageLength = 10,
-    initComplete = htmlwidgets::JS(
-      "function(settings, json) {
-         $(this.api().table().container()).css({'font-size': '10pt'});
-       }"
-    ),
-    autoWidth = TRUE,
-    columnDefs = list(list(width = "250px", targets = 1))
-  )
-)
-```
-
-### *F1 score*
-
-```{r echo = FALSE}
-datatable(
-  f1_overview, 
-  options = list(
-    pageLength = 10,
-    initComplete = htmlwidgets::JS(
-      "function(settings, json) {
-         $(this.api().table().container()).css({'font-size': '10pt'});
-       }"
-    ),
-    autoWidth = TRUE,
-    columnDefs = list(list(width = "250px", targets = 1))
-  )
-)
-```
-:::
-
-## *Number of records*
-
-```{r performance_vs_occurrences, echo = FALSE, message=FALSE, warnings=FALSE}
-# Dropdown options
-plotly_buttons = list()
-for(metric in focal_metrics){
-  plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", metric), label = metric)
-}
-
-df_plot = results_final_long
-
-# Calculate regression lines for each model and metric combination
-suppressWarnings({
-  regression_lines = df_plot %>%
-    group_by(model, metric) %>%
-    group_modify(~asym_regression(.x$obs, .x$value))
-})
-
-# Create base plot
-plot <- plot_ly() %>% 
-  layout(
-    title = "Model Performance vs. Number of observations",
-    xaxis = list(title = "Number of observations", type = "log"),
-    yaxis = list(title = "Value"),
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  )
-
-# Points
-for (model_name in unique(df_plot$model)) {
-  plot = plot %>%
-    add_markers(
-      data = filter(df_plot, model == model_name),
-      x = ~obs,
-      y = ~value,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      opacity = 0.6,
-      name = ~model,
-      hoverinfo = 'text',
-      text = ~paste("Species:", species, "<br>Observations:", obs, "<br>Value:", round(value, 3)),
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-# Add regression lines
-for(model_name in unique(df_plot$model)){
-  reg_data = dplyr::filter(regression_lines, model == model_name)
-  plot = plot %>%
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~fit,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(fit)'),
-      showlegend = FALSE,
-      transforms = list(
-        list(
-          type = 'filter',
-          target = ~metric,
-          operation = '=',
-          value = focal_metrics[1]
-        )
-      )
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-## *Relative Performance*
-
-```{r delta, echo = FALSE, message=FALSE, warnings=FALSE}
-results_ranked_obs = results_final_long %>% 
-  group_by(species,  metric) %>% 
-  mutate(rank = rank(value))
-
-reglines = results_ranked_obs %>%
-  group_by(model, metric) %>%
-  group_modify(~as.data.frame(loess.smooth(.x$obs, .x$rank)))
-
-
-# The table below summarizes the relative performance of the models across different observation frequency ranges. The `rank` column indicates the model's performance rank compared to all other models for a given combination of model and metric. The subsequent columns, `(1,10]`, `(10,25]`, ..., `(5000, Inf]`, represent bins of observation frequency. The values in these columns show how many times the model's performance was ranked at the specified `rank` within the respective frequency range.
-
-# freq_thresholds = c(1, 10, 25, 50, 100, 250, 500, 1000, 2500, 5000, Inf)
-# 
-# df_print = performance %>%
-#   mutate(freq_class = cut(obs, freq_thresholds, dig.lab = 5)) %>%
-#   group_by(species, metric, freq_class) %>%
-#   dplyr::mutate(rank = order(value, decreasing = T)) %>% 
-#   group_by(model, metric, rank, freq_class) %>% 
-#   tally() %>% 
-#   pivot_wider(names_from = freq_class, values_from = n) %>% 
-#   dplyr::select("model", "metric", "rank", "(1,10]", "(10,25]", 
-#                 "(25,50]", "(50,100]", "(100,250]", "(250,500]", 
-#                 "(500,1000]", "(1000,2500]", "(2500,5000]", "(5000,Inf]") %>% 
-#   dplyr::arrange(metric, rank, model) %>% 
-#   replace(is.na(.), 0)
-# 
-# DT::datatable(df_print)
-```
-
-::: panel-tabset
-### *AUC*
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(results_ranked_obs, metric == "auc")
-reglines_plot = dplyr::filter(reglines, metric == "auc")
-
-plot = plot_ly(
-  data = df_plot,
-  x = ~obs, 
-  y = ~rank, 
-  color = ~model,
-  type = 'scatter', 
-  mode = 'markers', 
-  opacity = 0.5
-) %>%
-  layout(
-    yaxis = list(title = "rank (lower is better)"),
-    xaxis = list(title = "number of observations", type = "log"), # log scale for x-axis
-    showlegend = TRUE
-  ) 
-
-
-for(model_name in unique(df_plot$model)){
-  reg_data = dplyr::filter(reglines_plot, model == model_name)
-  plot = plot %>% 
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~y,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(smooth)'),
-      showlegend = FALSE,
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-### *Accuracy*
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(results_ranked_obs, metric == "accuracy")
-reglines_plot = dplyr::filter(reglines, metric == "accuracy")
-
-plot = plot_ly(
-  data = df_plot,
-  x = ~obs, 
-  y = ~rank, 
-  color = ~model,
-  type = 'scatter', 
-  mode = 'markers', 
-  opacity = 0.5
-) %>%
-  layout(
-    yaxis = list(title = "rank (lower is better)"),
-    xaxis = list(title = "number of observations", type = "log"), # log scale for x-axis
-    showlegend = TRUE
-  ) 
-
-
-for(model_name in unique(df_plot$model)){
-  reg_data = dplyr::filter(reglines_plot, model == model_name)
-  plot = plot %>% 
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~y,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(smooth)'),
-      showlegend = FALSE,
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-
-### *F1 score*
-
-```{r echo = FALSE}
-df_plot = dplyr::filter(results_ranked_obs, metric == "f1")
-reglines_plot = dplyr::filter(reglines, metric == "f1")
-
-plot = plot_ly(
-  data = df_plot,
-  x = ~obs, 
-  y = ~rank, 
-  color = ~model,
-  type = 'scatter', 
-  mode = 'markers', 
-  opacity = 0.5
-) %>%
-  layout(
-    yaxis = list(title = "rank (lower is better)"),
-    xaxis = list(title = "number of observations", type = "log"), # log scale for x-axis
-    showlegend = TRUE
-  ) 
-
-
-for(model_name in unique(df_plot$model)){
-  reg_data = dplyr::filter(reglines_plot, model == model_name)
-  plot = plot %>% 
-    add_lines(
-      data = reg_data,
-      x = ~x,
-      y = ~y,
-      color = model_name,  # Set color to match legendgroup
-      legendgroup = model_name,
-      name = paste(model_name, '(smooth)'),
-      showlegend = FALSE,
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-:::
-
-## *Trait space*
-
-```{r trait_pca, echo = FALSE, message=FALSE, warnings=FALSE}
-load("../data/r_objects/traits_proc.RData")
-
-# Preprocess traits for PCA
-traits_num = traits_proc %>% 
-  ungroup() %>% 
-  dplyr::mutate(temp_id = row_number()) %>%
-  pivot_wider(
-    names_from = ForStrat.Value,
-    values_from = temp_id,
-    values_fn = function(x) 1,
-    values_fill = 0,
-    names_prefix = "ForStrat."
-  ) %>% 
-  drop_na(species) %>% 
-  column_to_rownames(var = "species") %>% 
-  scale()
-
-# Run PCA
-traits_pca = prcomp(traits_num)
-species_vcts = traits_pca$x %>% 
-  as.data.frame() %>% 
-  rownames_to_column(var = "species")
-
-df_performance_vs_traits = dplyr::inner_join(delta_performance, species_vcts)
-```
-
-Functional traits for `r I(nrow(traits_proc))` species were used to construct a multidimensional trait space. Before ordination, categorical variables were converted into dummy variables, and all variables were scaled and centered. A Principal Component Analysis (PCA) was performed on the preprocessed dataset, and the first three axes are used to visualize species' positions within the trait space. For reference, the rotation vectors of the original traits are also plotted.
-
-::: panel-tabset
-#### *AUC*
-
-```{r auc_trait_space, echo = FALSE, message=FALSE, warnings=FALSE}
-# Create plot df
-df_plot = dplyr::filter(df_performance_vs_traits, metric == "auc", !is.na(delta)) %>% 
-  group_by(species) %>% 
-  mutate(
-    PC1 = PC1 + runif(1, -0.15, 0.15), # add jitter
-    PC2 = PC2 + runif(1, -0.15, 0.15)  # add jitter
-  ) 
-
-# Buttons
-plotly_buttons = list()
-for(model in c("M_PS", "M_PT", "M_TS", "M_TT", "M_RS", "M_RT")){
-  plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", model), label = model)
-}
-
-# Create plot
-plot <- plot_ly(
-  marker = list(           
-    color = ~delta,
-    colors = colorRamp(c("blue", "lightgrey", "red")),
-    cauto = F,
-    cmin = -1,
-    cmid = 0, 
-    cmax = 1,
-    opacity = 0.6,
-    colorbar = list(
-      title = "\u0394 AUC",
-      titleside = "right",  
-      x = 1.1,  # Adjust x position of colorbar
-      y = 0.5   # Adjust y position of colorbar
-    )
-  )) %>%
-  add_markers(
-    data = df_plot,
-    x = ~PC1,
-    y = ~PC2,
-    z = ~PC3,
-    hoverinfo = 'text',
-    text = ~paste("Species:", species, "<br>\u0394:", round(delta, 3)),
-    transforms = list(
-      list(
-        type = 'filter',
-        target = ~model,
-        operation = '=',
-        value = "M_TS"
-      )
-    )
-  ) %>% 
-  layout(
-    title = "Relative model performance (trait space)",
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  ) 
-
-trait_loadings = traits_pca$rotation * traits_pca$sdev
-for (i in 1:nrow(trait_loadings)) {
-  plot <- plot %>%
-    add_trace(
-      x = c(0, trait_loadings[i, 1]),
-      y = c(0, trait_loadings[i, 2]),
-      z = c(0, trait_loadings[i, 3]),
-      type = "scatter3d",
-      mode = "lines+markers+text",
-      line = list(
-        color = "black",
-        width = 2,
-        arrow = list(
-          end = 1,
-          type = "open",
-          length = 0.1
-        )
-      ),
-      marker = list(
-        color = "black",
-        size = 1,
-        symbol="arrow"
-      ),
-      text = rownames(trait_loadings)[i],
-      textposition = "top right",
-      opacity = 1,
-      hoverinfo = "text",
-      text = rownames(trait_loadings)[i],
-      showlegend = FALSE
-    )
-}
-
-
-bslib::card(plot, full_screen = T)
-```
-
-#### *Accuracy*
-
-```{r accuracy_trait_space, echo = FALSE, message=FALSE, warnings=FALSE}
-# Create plot df
-df_plot = dplyr::filter(df_performance_vs_traits, metric == "accuracy", !is.na(delta)) %>% 
-  group_by(species) %>% 
-  mutate(     # add jitter
-    PC1 = PC1 + runif(1, -0.15, 0.15),
-    PC2 = PC2 + runif(1, -0.15, 0.15),
-    PC3 = PC3 + runif(1, -0.15, 0.15)
-  ) 
-
-# Buttons
-plotly_buttons = list()
-for(model in c("M_PS", "M_PT", "M_TS", "M_TT", "M_RS", "M_RT")){
-  plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", model), label = model)
-}
-
-# Create plot
-plot <- plot_ly(
-  marker = list(           
-    color = ~delta,
-    colors = colorRamp(c("blue", "lightgrey", "red")),
-    cauto = F,
-    cmin = -1,
-    cmid = 0, 
-    cmax = 1,
-    opacity = 0.6,
-    colorbar = list(
-      title = "\u0394 Accuracy",
-      titleside = "right",  
-      x = 1.1,  # Adjust x position of colorbar
-      y = 0.5   # Adjust y position of colorbar
-    )
-  )) %>%
-  add_markers(
-    data = df_plot,
-    x = ~PC1,
-    y = ~PC2,
-    z = ~PC3,
-    hoverinfo = 'text',
-    text = ~paste("Species:", species, "<br>\u0394:", round(delta, 3)),
-    transforms = list(
-      list(
-        type = 'filter',
-        target = ~model,
-        operation = '=',
-        value = "M_TS"
-      )
-    )
-  ) %>% 
-  layout(
-    title = "Relative model performance (trait space)",
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  ) 
-
-trait_loadings = traits_pca$rotation * traits_pca$sdev
-for (i in 1:nrow(trait_loadings)) {
-  plot <- plot %>%
-    add_trace(
-      x = c(0, trait_loadings[i, 1]),
-      y = c(0, trait_loadings[i, 2]),
-      z = c(0, trait_loadings[i, 3]),
-      type = "scatter3d",
-      mode = "lines+markers+text",
-      line = list(
-        color = "black",
-        width = 2,
-        arrow = list(
-          end = 1,
-          type = "open",
-          length = 0.1
-        )
-      ),
-      marker = list(
-        color = "black",
-        size = 1,
-        symbol="arrow"
-      ),
-      text = rownames(trait_loadings)[i],
-      textposition = "top right",
-      opacity = 1,
-      hoverinfo = "text",
-      text = rownames(trait_loadings)[i],
-      showlegend = FALSE
-    )
-}
-
-
-bslib::card(plot, full_screen = T)
-```
-
-#### *F1*
-
-```{r f1_trait_space, echo = FALSE, message=FALSE, warnings=FALSE}
-# Create plot df
-df_plot = dplyr::filter(df_performance_vs_traits, metric == "f1", !is.na(delta)) %>% 
-  group_by(species) %>% 
-  mutate(
-    PC1 = PC1 + runif(1, -0.15, 0.15),    # add jitter
-    PC2 = PC2 + runif(1, -0.15, 0.15)     # add jitter 
-  ) 
-
-# Buttons
-plotly_buttons = list()
-for(model in c("M_PS", "M_PT", "M_TS", "M_TT", "M_RS", "M_RT")){
-  plotly_buttons[[length(plotly_buttons) + 1]] = list(method = "restyle", args = list("transforms[0].value", model), label = model)
-}
-
-# Create plot
-plot <- plot_ly(
-  marker = list(           
-    color = ~delta,
-    colors = colorRamp(c("blue", "lightgrey", "red")),
-    cauto = F,
-    cmin = -1,
-    cmid = 0, 
-    cmax = 1,
-    opacity = 0.6,
-    colorbar = list(
-      title = "\u0394 F1",
-      titleside = "right",  
-      x = 1.1,  # Adjust x position of colorbar
-      y = 0.5   # Adjust y position of colorbar
-    )
-  )) %>%
-  add_markers(
-    data = df_plot,
-    x = ~PC1,
-    y = ~PC2,
-    z = ~PC3,
-    hoverinfo = 'text',
-    text = ~paste("Species:", species, "<br>\u0394:", round(delta, 3)),
-    transforms = list(
-      list(
-        type = 'filter',
-        target = ~model,
-        operation = '=',
-        value = "M_TS"
-      )
-    )
-  ) %>% 
-  layout(
-    title = "Relative model performance (trait space)",
-    legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
-    margin = list(r = 150),  # Add right margin to accommodate legend
-    hovermode = 'closest',
-    updatemenus = list(
-      list(
-        type = "dropdown",
-        active = 0,
-        buttons = plotly_buttons
-      )
-    )
-  ) 
-
-trait_loadings = traits_pca$rotation * traits_pca$sdev
-for (i in 1:nrow(trait_loadings)) {
-  plot <- plot %>%
-    add_trace(
-      x = c(0, trait_loadings[i, 1]),
-      y = c(0, trait_loadings[i, 2]),
-      z = c(0, trait_loadings[i, 3]),
-      type = "scatter3d",
-      mode = "lines+markers+text",
-      line = list(
-        color = "black",
-        width = 2,
-        arrow = list(
-          end = 1,
-          type = "open",
-          length = 0.1
-        )
-      ),
-      marker = list(
-        color = "black",
-        size = 1,
-        symbol="arrow"
-      ),
-      text = rownames(trait_loadings)[i],
-      textposition = "top right",
-      opacity = 1,
-      hoverinfo = "text",
-      text = rownames(trait_loadings)[i],
-      showlegend = FALSE
-    )
-}
-
-bslib::card(plot, full_screen = T)
-```
-:::
-
-### *Taxonomy*
-
-```{r taxonomy, echo = FALSE, message=FALSE, warnings=FALSE}
-# load("../data/r_objects/functional_groups.RData")
-# 
-# df_plot = results_final %>% 
-#   dplyr::left_join(functional_groups, by = c("species" = "name_matched"))
-# 
-# plot = 
-#   
-# bslib::card(plot, full_screen = T)
-
-```
diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R
new file mode 100644
index 0000000..babf048
--- /dev/null
+++ b/R/05_02_publication_analysis.R
@@ -0,0 +1,382 @@
+# General packages
+library(tidyverse)
+library(patchwork)
+
+# Geo packages
+library(terra)
+library(sf)
+library(geos)
+
+# Stats packages
+library(Rtsne)
+library(cito)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data.RData")
+load("data/r_objects/ssdm_results.RData")
+load("data/r_objects/msdm_embed_performance.RData")
+load("data/r_objects/functional_groups.RData")
+
+load("data/r_objects/sa_polygon.RData")
+load("data/r_objects/range_maps.RData")
+
+sf::sf_use_s2(use_s2 = FALSE)
+
+model_data = model_data %>% 
+  dplyr::filter(!is.na(fold_eval)) %>% 
+  dplyr::mutate(species = as.factor(species))
+
+# ------------------------------------------------------------------ #
+# 1. Collect performance results                                  ####
+# ------------------------------------------------------------------ #
+performance = ssdm_results %>% 
+  bind_rows(msdm_embed_performance) %>% 
+  dplyr::group_by(species, model, metric) %>% 
+  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
+  dplyr::mutate(
+    metric = stringr::str_to_lower(metric),
+    value = case_when(
+      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
+      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
+      .default = value
+    )
+  )
+
+# ------------------------------------------------------------------ #
+# 2. Model comparison                                             ####
+# ------------------------------------------------------------------ #
+## Overall model Comparison ####
+df_plot = performance %>% 
+  dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall"))
+
+ggplot(df_plot, aes(x = model, y = value, color = model)) +
+  geom_boxplot(alpha = 0.5, outlier.shape = 16) + 
+  facet_wrap(~ metric, scales = "free_y") +
+  labs(
+    title = "Model Performance Across Metrics",
+    x = "Model",
+    y = "Value",
+    color = "Model",
+  ) +
+  theme_minimal(base_size = 14) +
+  theme(
+    strip.text = element_text(face = "bold"),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    legend.position = "right"
+  )
+
+ggsave("plots/publication/model_performance_across_metrics.pdf", 
+       device = "pdf", 
+       scale = 2,
+       width = 18, 
+       height = 14,
+       units = "cm")
+
+## Performance vs number of records ####
+obs_count = model_data %>% 
+  sf::st_drop_geometry() %>% 
+  dplyr::filter(present == 1, !is.na(fold_eval)) %>% 
+  dplyr::group_by(species) %>% 
+  dplyr::summarise(obs = n())
+
+df_plot = performance %>% 
+  dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) %>% 
+  dplyr::left_join(obs_count, by = "species") 
+
+ggplot(df_plot, aes(x = obs, y = value, color = model, fill = model)) +
+  geom_point(alpha = 0.2) +
+  geom_smooth() + 
+  facet_wrap(~ metric, scales = "free_y") +
+  scale_x_continuous(trans = "log10") +
+  labs(
+    title = "Model Performance vs Number of Records",
+    x = "Number of Records (log scale)",
+    y = "Value",
+    color = "Model",
+    fill = "Model"
+  ) +
+  theme_minimal(base_size = 14) +
+  theme(
+    strip.text = element_text(face = "bold"),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    legend.position = "right"
+  )
+
+ggsave("plots/publication/model_performance_vs_number_of_records.pdf", 
+       device = "pdf", 
+       scale = 2,
+       width = 18, 
+       height = 14,
+       units = "cm")
+
+## Performance vs functional group ####
+load("data/r_objects/functional_groups.RData")
+
+df_plot = performance %>% 
+  dplyr::filter(metric %in% c("auc", "f1", "kappa")) %>% 
+  dplyr::left_join(functional_groups, by = c("species" = "name_matched"))
+
+ggplot(df_plot, aes(x = model, y = value, color = model)) +
+  geom_boxplot(alpha = 0.5, outlier.shape = 16) + 
+  facet_grid(rows = vars(metric), cols = vars(functional_group), scales = "free_y", switch = "both") +
+  labs(
+    title = "Model Performance Across Functional Groups",
+    x = "Model",
+    y = "Value",
+    color = "Model",
+  ) +
+  theme_minimal(base_size = 14) +
+  theme(
+    strip.text = element_text(face = "bold"),
+    panel.border = element_rect(color = "gray", fill = NA),
+    axis.text.x = element_text(angle = 45, hjust = 1),
+    legend.position = "right"
+  )
+
+ggsave("plots/publication/model_performance_vs_functional_groups.pdf", 
+       device = "pdf", 
+       scale = 2,
+       width = 18, 
+       height = 14,
+       units = "cm")
+
+# ------------------------------------------------------------------ #
+# 3. Range predictions                                            ####
+# ------------------------------------------------------------------ #
+# Define plotting function
+plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "nn", "msdm")){
+  # Species data
+  load("data/r_objects/range_maps.RData")
+  
+  pa_spec = model_data %>% 
+    dplyr::filter(species == !!spec)
+  range_spec = range_maps %>% 
+    dplyr::filter(name_matched == !!spec) %>% 
+    sf::st_transform(sf::st_crs(pa_spec))
+  
+  # Extract raster values into df
+  bbox_spec = sf::st_bbox(pa_spec) %>% 
+    expand_bbox(expansion = 0.25)
+  
+  raster_crop = terra::crop(raster_data, bbox_spec)
+  new_data = raster_crop %>% 
+    terra::values(dataframe = T, na.rm = T) %>% 
+    dplyr::mutate(species = unique(pa_spec$species)) 
+  
+  # Make predictions
+  for(algorithm in algorithms){
+    message(algorithm)
+    # Load model
+    tryCatch({
+      if(algorithm == "msdm"){
+        load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData")
+        probabilities = predict(msdm_embed_fit, new_data, type = "response")[,1]
+        predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P"))
+      } else if(algorithm == "nn") {
+        load(paste0("data/r_objects/ssdm_results/models/", spec, "_nn_fit.RData"))
+        probabilities = predict(nn_fit, new_data, type = "response")[,1]
+        predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P"))
+      } else {
+        load(paste0("data/r_objects/ssdm_results/models/", spec, "_", algorithm, "_fit.RData"))
+        predictions = predict(get(paste0(algorithm, "_fit")), new_data, type = "raw")
+      }
+    }, error = function(e){
+      warning(toupper(algorithm), ": Model could not be loaded.")
+    })
+
+    raster_pred = terra::rast(raster_crop, nlyrs = 1)
+    values(raster_pred)[as.integer(rownames(new_data))] <- predictions
+    
+    # Plot
+    plot(raster_pred, 
+         type = "classes",
+         levels = c("Absent", "Present"),
+         main = paste0("Range prediction (", toupper(algorithm), "): ", spec))
+    point_colors = sapply(pa_spec$present, function(x) ifelse(x == 0, "black", "white"))
+    plot(pa_spec[,"present"], col = point_colors, add = T, pch = 16)
+    plot(range_spec, border = "white", lwd = 2, col = NA, add = T)
+  }
+}
+
+# Load raster
+raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T)
+raster_data = terra::rast(raster_filepaths) %>% 
+  terra::crop(sf::st_bbox(sa_polygon)) %>% 
+  terra::project(sf::st_crs(model_data)$input)
+
+spec = "Thyroptera tricolor"
+pdf(file = paste0("plots/range_predictions/", spec, ".pdf"))
+plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm"))
+dev.off()
+
+# ------------------------------------------------------------------ #
+# 5. Compare msdm predictions                                     ####
+# ------------------------------------------------------------------ #
+# Check predictions for different species
+load("data/r_objects/msdm_embed_results/msdm_embed_fit.RData")
+specs = sample(unique(model_data$species), size = 2, replace = F)
+
+new_data = spatSample(raster_data, 1000, replace = F, as.df = T) %>% 
+  drop_na()
+
+new_data1 = dplyr::mutate(new_data, species = specs[1])
+new_data2 = dplyr::mutate(new_data, species = specs[2])
+
+pred1 = predict(msdm_embed_fit, new_data1, type = "response")
+pred2 = predict(msdm_embed_fit, new_data2, type = "response")
+
+pred1 == pred2
+
+# ---> identical predictions
+
+# Permute embeddings
+embeddings = coef(msdm_embed_fit)[[1]][[1]]
+msdm_embed_fit$net$e_1$parameters$weight$set_data(torch::torch_tensor(embeddings[sample.int(nrow(embeddings)), ], device = msdm_embed_fit$net$e_1$parameters$weight$device, dtype = msdm_embed_fit$net$e_1$parameters$weight$dtype ))
+
+# ---> Invalid external pointer
+
+# ------------------------------------------------------------------ #
+# 6. Compare species embeddings                                   ####
+# ------------------------------------------------------------------ #
+all_embeddings = lapply(1:5, function(fold){
+  load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData"))
+  coef(msdm_embed_fit)[[1]][[1]]
+})
+
+pairs = combn(1:5, 2, simplify = F)
+pairwise_rv = lapply(pairs, function(pair){
+  FactoMineR::coeffRV(all_embeddings[[pair[1]]], all_embeddings[[pair[2]]])  
+})
+
+rv_matrix = matrix(NA, 5, 5)
+for(i in seq_along(pairs)){
+  rv_matrix[pairs[[i]][2], pairs[[i]][1]] = pairwise_rv[[i]][["rv"]]
+}
+
+p_matrix = matrix(NA, 5, 5)
+for(i in seq_along(pairs)){
+  p_matrix[pairs[[i]][2], pairs[[i]][1]] = pairwise_rv[[i]][["p.value"]]
+}
+
+# ------------------------------------------------------------------ #
+# 7. Analyse species embeddings                                      ####
+# ------------------------------------------------------------------ #
+embeddings = coef(msdm_fit_embedding_raw)[[1]][[1]]
+species_lookup = msdm_fit_embedding_raw$data$data[,c("species", "species_int")] %>% 
+  distinct() %>% 
+  mutate(genus = stringr::str_extract(rownames(embeddings), "([a-zA-Z]+) (.*)", group = 1))
+
+rownames(embeddings) = species_lookup$species
+
+##  Dimensionality reduction        ####
+### PCA                             ####
+pca_result = prcomp(t(embeddings), scale. = TRUE)
+var_explained = pca_result$sdev^2 / sum(pca_result$sdev^2)
+plot(var_explained)
+
+# --> Variance explained is distributed rather evenly across dimensions
+# --> Dimensionality reduction probably not useful
+
+coords = pca_result$rotation[,1:2] %>%
+  magrittr::set_colnames(c("X", "Y"))
+
+df_plot = species_lookup %>% 
+  left_join(functional_groups, by = c("species" = "name_matched")) %>% 
+  bind_cols(coords)
+
+ggplot(df_plot, aes(x = X, y = Y, col = functional_group, label=genus)) +
+  geom_point() +
+  geom_text(hjust=0, vjust=0) +
+  guides(col="none")
+
+# --> PCA not informative on 2 dimensions either
+
+### TSNE                            ####
+tsne_result <- Rtsne(embeddings, verbose = FALSE)
+coords = tsne_result$Y %>%
+  magrittr::set_colnames(c("X", "Y"))
+
+df_plot = species_lookup %>% 
+  left_join(functional_groups, by = c("species" = "name_matched")) %>% 
+  bind_cols(coords)
+
+ggplot(df_plot, aes(x = X, y = Y, col = functional_group, label=genus)) +
+  geom_point() +
+  geom_text(hjust=0, vjust=0) +
+  guides(col="none")
+
+# --> TSNE not informative on 2 dimensions either
+
+
+### KNN                             ####
+embeddings_dist = as.matrix(dist(embeddings))
+
+k = 9
+knn_results = sapply(as.character(species_lookup$species), FUN = function(spec){
+  return(sort(embeddings_dist[spec,])[2:(k+1)])
+}, USE.NAMES = T, simplify = F)
+
+plot_knn_ranges = function(spec){
+  spec_df = model_data %>% 
+    dplyr::filter(species == !!spec)
+  
+  p_spec_range = ggplot() +
+    geom_sf(data = st_as_sf(sa_polygon)) +
+    geom_sf(data = spec_df) + 
+    ggtitle(paste0(spec)) +
+    theme_minimal()
+  
+  knn_df = model_data %>% 
+    dplyr::filter(species %in% c(names(knn_results[[spec]])))
+  
+  p_knn_ranges = ggplot() +
+    geom_sf(data = st_as_sf(sa_polygon)) +
+    geom_sf(data = knn_df, aes(color = species)) + 
+    facet_wrap(facets = "species", ncol = 3) +
+    theme_minimal() +
+    guides(color="none")
+  
+  p_spec_range + p_knn_ranges
+}
+
+plot_knn_ranges(sample(species_lookup$species, 1)) # Repeat this line to plot random species
+
+# Clustering
+embeddings_dist = dist(embeddings, method = "euclidean")
+hclust_complete = hclust(dist_matrix, method = "complete")
+plot(hclust_complete, main = "Complete Linkage", xlab = "", sub = "", labels = FALSE)
+
+# Phylo Correlation
+load("data/r_objects/phylo_dist.RData")
+
+species_intersect = intersect(colnames(phylo_dist), species_lookup$species)
+
+phylo_dist_subset = phylo_dist[species_intersect, species_intersect]
+
+e_indices = match(species_intersect, species_lookup$species)
+embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices]
+FactoMineR::coeffRV(embeddings_dist_subset, phylo_dist_subset)
+
+# Trait Correlation
+load("data/r_objects/func_dist.RData")
+
+species_intersect = intersect(colnames(func_dist), species_lookup$species)
+
+func_dist_subset = func_dist[species_intersect, species_intersect]
+
+e_indices = match(species_intersect, species_lookup$species)
+embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices]
+FactoMineR::coeffRV(embeddings_dist_subset, func_dist_subset)
+
+# Range Correlation
+load("data/r_objects/range_dist.RData")
+
+species_intersect = intersect(colnames(range_dist), species_lookup$species)
+
+range_dist_subset = range_dist[species_intersect, species_intersect]
+
+e_indices = match(species_intersect, species_lookup$species)
+embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices]
+FactoMineR::coeffRV(embeddings_dist_subset, range_dist_subset)
+
diff --git a/R/_publish.yml b/R/_publish.yml
index c07bacd..9d5a5d0 100644
--- a/R/_publish.yml
+++ b/R/_publish.yml
@@ -6,3 +6,7 @@
   quarto-pub:
     - id: f9cafa63-bfd4-4c00-b401-09ec137bd3ce
       url: 'https://chrkoenig.quarto.pub/sdm-performance-carsten'
+- source: 05_01_performance_analysis.qmd
+  quarto-pub:
+    - id: cfaa8a4d-31e3-410c-ba1a-399aad5582fd
+      url: 'https://chrkoenig.quarto.pub/sdm-performance-analysis-8272'
diff --git a/R/utils.R b/R/utils.R
index 265feed..b433a44 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -17,10 +17,10 @@ expand_bbox <- function(bbox, min_span = 1, expansion = 0.25) {
   }
   
   # Expand the limits, adjusting both directions correctly
-  bbox["xmin"] <- max(bbox["xmin"] - (x_expand * x_range), -180)
-  bbox["xmax"] <- min(bbox["xmax"] + (x_expand * x_range), 180)
-  bbox["ymin"] <- max(bbox["ymin"] - (y_expand * y_range), -90)
-  bbox["ymax"] <- min(bbox["ymax"] + (y_expand * y_range), 90)
+  bbox["xmin"] <- bbox["xmin"] - (x_expand * x_range)
+  bbox["xmax"] <- bbox["xmax"] + (x_expand * x_range)
+  bbox["ymin"] <- bbox["ymin"] - (y_expand * y_range)
+  bbox["ymax"] <- bbox["ymax"] + (y_expand * y_range)
   
   return(bbox)
 }
@@ -40,8 +40,8 @@ evaluate_model <- function(model, data) {
   
   # Predict probabilities
   if(class(model) %in% c("citodnn", "citodnnBootstrap")){
-    data = dplyr::select(data, any_of(all.vars(msdm_fit_embedding_raw$old_formula)))
-    probs = predict(model, as.matrix(data), type = "response")[,1]
+    data = dplyr::select(data, any_of(all.vars(model$old_formula)))
+    probs = predict(model, data, type = "response")[,1]
     preds = factor(round(probs), levels = c("0", "1"), labels = c("A", "P"))
   } else {
     probs = predict(model, data, type = "prob")$P
@@ -64,7 +64,11 @@ evaluate_model <- function(model, data) {
       Kappa = cm$overall["Kappa"],
       Precision = cm$byClass["Precision"],
       Recall = cm$byClass["Recall"],
-      F1 = cm$byClass["F1"]
+      F1 = cm$byClass["F1"],
+      TP = cm$table["P", "P"],
+      FP = cm$table["P", "A"],
+      TN = cm$table["A", "A"],
+      FN = cm$table["A", "P"]
     )
   )
 }
diff --git a/README.md b/README.md
index 7ea2759..aaaa50c 100644
--- a/README.md
+++ b/README.md
@@ -1,93 +1,3 @@
 # Symobio-modeling
 
-
-
-## Getting started
-
-To make it easy for you to get started with GitLab, here's a list of recommended next steps.
-
-Already a pro? Just edit this README.md and make it your own. Want to make it easy? [Use the template at the bottom](#editing-this-readme)!
-
-## Add your files
-
-- [ ] [Create](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#create-a-file) or [upload](https://docs.gitlab.com/ee/user/project/repository/web_editor.html#upload-a-file) files
-- [ ] [Add files using the command line](https://docs.gitlab.com/ee/gitlab-basics/add-file.html#add-a-file-using-the-command-line) or push an existing Git repository with the following command:
-
-```
-cd existing_repo
-git remote add origin https://git.idiv.de/ye87zine/symobio-modeling.git
-git branch -M main
-git push -uf origin main
-```
-
-## Integrate with your tools
-
-- [ ] [Set up project integrations](https://git.idiv.de/ye87zine/symobio-modeling/-/settings/integrations)
-
-## Collaborate with your team
-
-- [ ] [Invite team members and collaborators](https://docs.gitlab.com/ee/user/project/members/)
-- [ ] [Create a new merge request](https://docs.gitlab.com/ee/user/project/merge_requests/creating_merge_requests.html)
-- [ ] [Automatically close issues from merge requests](https://docs.gitlab.com/ee/user/project/issues/managing_issues.html#closing-issues-automatically)
-- [ ] [Enable merge request approvals](https://docs.gitlab.com/ee/user/project/merge_requests/approvals/)
-- [ ] [Set auto-merge](https://docs.gitlab.com/ee/user/project/merge_requests/merge_when_pipeline_succeeds.html)
-
-## Test and Deploy
-
-Use the built-in continuous integration in GitLab.
-
-- [ ] [Get started with GitLab CI/CD](https://docs.gitlab.com/ee/ci/quick_start/index.html)
-- [ ] [Analyze your code for known vulnerabilities with Static Application Security Testing (SAST)](https://docs.gitlab.com/ee/user/application_security/sast/)
-- [ ] [Deploy to Kubernetes, Amazon EC2, or Amazon ECS using Auto Deploy](https://docs.gitlab.com/ee/topics/autodevops/requirements.html)
-- [ ] [Use pull-based deployments for improved Kubernetes management](https://docs.gitlab.com/ee/user/clusters/agent/)
-- [ ] [Set up protected environments](https://docs.gitlab.com/ee/ci/environments/protected_environments.html)
-
-***
-
-# Editing this README
-
-When you're ready to make this README your own, just edit this file and use the handy template below (or feel free to structure it however you want - this is just a starting point!). Thanks to [makeareadme.com](https://www.makeareadme.com/) for this template.
-
-## Suggestions for a good README
-
-Every project is different, so consider which of these sections apply to yours. The sections used in the template are suggestions for most open source projects. Also keep in mind that while a README can be too long and detailed, too long is better than too short. If you think your README is too long, consider utilizing another form of documentation rather than cutting out information.
-
-## Name
-Choose a self-explaining name for your project.
-
-## Description
-Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors.
-
-## Badges
-On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
-
-## Visuals
-Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
-
-## Installation
-Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
-
-## Usage
-Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
-
-## Support
-Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
-
-## Roadmap
-If you have ideas for releases in the future, it is a good idea to list them in the README.
-
-## Contributing
-State if you are open to contributions and what your requirements are for accepting them.
-
-For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
-
-You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
-
-## Authors and acknowledgment
-Show your appreciation to those who have contributed to the project.
-
-## License
-For open source projects, say how it is licensed.
-
-## Project status
-If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
+Code and data accompanying SSDM vs MSDM model comparison
\ No newline at end of file
diff --git a/snippets.R b/snippets.R
new file mode 100644
index 0000000..40f5523
--- /dev/null
+++ b/snippets.R
@@ -0,0 +1,160 @@
+### Range coverage bias
+
+Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
+
+$$
+  RangeCoverageBias = 1 - \frac{RangeCoverage}{min({N_{obs\_total}} / {N_{cells\_total}}, 1)}
+  $$
+    
+    Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
+  
+  -   There was no strong relationship between range coverage bias and model performance
+  
+  ```{r range_coverage_bias, echo = FALSE, message=FALSE, warnings=FALSE, eval=F}
+  df_occs_total = occs_final %>% 
+    st_drop_geometry() %>% 
+    group_by(species) %>% 
+    summarise(occs_total = n())
+  
+  df_join = df_occs_total %>% 
+    dplyr::inner_join(df_cells_total, by = "species") %>% 
+    dplyr::inner_join(df_cells_occ, by = "species") %>% 
+    dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1)))
+  
+  df_plot = performance %>% 
+    inner_join(df_join, by = "species")
+  
+  # Calculate regression lines for each model and metric combination
+  suppressWarnings({
+    regression_lines = df_plot %>%
+      group_by(model, metric) %>%
+      group_modify(~lin_regression(.x$range_bias, .x$value))
+  })
+  
+  # Create base plot
+  plot <- plot_ly() %>% 
+    layout(
+      title = "Model Performance vs. Range coverage bias",
+      xaxis = list(title = "Range coverage bias"),
+      yaxis = list(title = "Value"),
+      legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
+      margin = list(r = 150),  # Add right margin to accommodate legend
+      hovermode = 'closest',
+      updatemenus = list(
+        list(
+          type = "dropdown",
+          active = 0,
+          buttons = plotly_buttons
+        )
+      )
+    )
+  
+  # Add regression lines and confidence intervals for each model
+  for (model_name in unique(df_plot$model)) {
+    plot <- plot %>%
+      add_markers(
+        data = filter(df_plot, model == model_name),
+        x = ~ range_bias,
+        y = ~ value,
+        color = model_name,  # Set color to match legendgroup
+        legendgroup = model_name,
+        opacity = 0.6,
+        name = ~model,
+        hoverinfo = 'text',
+        text = ~paste("Species:", species, "<br>Range coverage bias:", range_bias, "<br>Value:", round(value, 3)),
+        transforms = list(
+          list(
+            type = 'filter',
+            target = ~metric,
+            operation = '=',
+            value = focal_metrics[1]
+          )
+        )
+      )
+  }
+  
+  for (model_name in unique(df_plot$model)) {
+    reg_data = dplyr::filter(regression_lines, model == model_name)
+    plot = plot %>% 
+      add_lines(
+        data = reg_data,
+        x = ~x,
+        y = ~fit,
+        color = model_name,  # Set color to match legendgroup
+        legendgroup = model_name,
+        name = paste(model_name, '(fit)'),
+        showlegend = FALSE,
+        transforms = list(
+          list(
+            type = 'filter',
+            target = ~metric,
+            operation = '=',
+            value = focal_metrics[1]
+          )
+        )
+      )
+  }
+  
+  
+  bslib::card(plot, full_screen = T)
+  ```
+  
+  ### Functional group
+  
+  Functional groups were assigned based on taxonomic order. The following groupings were used:
+    
+    | Functional group      | Taxomic orders                                                        |
+    |-------------------|-----------------------------------------------------|
+    | large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla                    |
+    | small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
+    | arboreal              | Primates, Pilosa                                                      |
+    | flying                | Chiroptera                                                            |
+    
+    -   Models for bats tended to perform slightly worse than for other groups.
+  
+  ```{r functional_groups, echo = FALSE, message=FALSE, warnings=FALSE, eval=F}
+  df_plot = performance %>% 
+    dplyr::left_join(functional_groups, by = c("species" = "name_matched"))
+  
+  plot <- plot_ly(
+    data = df_plot,
+    x = ~functional_group,
+    y = ~value,
+    color = ~model,
+    type = 'box',
+    boxpoints = "all",
+    jitter = 1,
+    pointpos = 0,
+    hoverinfo = 'text',
+    text = ~paste("Species:", species, "<br>Functional group:", functional_group, "<br>Value:", round(value, 3)),
+    transforms = list(
+      list(
+        type = 'filter',
+        target = ~metric,
+        operation = '=',
+        value = focal_metrics[1]  # default value
+      )
+    )
+  )
+  
+  plot <- plot %>%
+    layout(
+      title = "Model Performance vs. Functional Group",
+      xaxis = list(title = "Functional group"),
+      yaxis = list(title = "Value"),
+      legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
+      margin = list(r = 150),  # Add right margin to accommodate legend
+      hovermode = 'closest',
+      boxmode = "group",
+      updatemenus = list(
+        list(
+          type = "dropdown",
+          active = 0,
+          buttons = plotly_buttons
+        )
+      )
+    )
+  
+  bslib::card(plot, full_screen = T)
+  ```
+  
\ No newline at end of file
-- 
GitLab