diff --git a/R/03_03_absence_preparation_random.R b/R/03_03_absence_preparation_random.R
new file mode 100644
index 0000000000000000000000000000000000000000..7986d0bf8e98e534c7160479a7c1f4e8235ea9f4
--- /dev/null
+++ b/R/03_03_absence_preparation_random.R
@@ -0,0 +1,185 @@
+# General packages
+library(dplyr)
+library(tidyr)
+library(furrr)
+
+# Geo packages
+library(terra)
+library(CoordinateCleaner)
+library(sf)
+library(geos)
+
+# Stat packages
+library(MASS)
+library(dismo)
+library(spatialEco)
+library(blockCV)
+library(caret)
+
+source("R/utils.R")
+
+load("data/r_objects/range_maps.RData")
+load("data/r_objects/occs_final.RData")
+
+# ---------------------------------------------------------------------------#
+# Prepare Geodata                                                         ####
+# ---------------------------------------------------------------------------#
+proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
+
+target_species = unique(occs_final$species)
+
+sa_polygon = rnaturalearth::ne_countries() %>% 
+  dplyr::filter(continent == "South America") %>% 
+  sf::st_union() 
+
+raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
+  stringr::str_sort(numeric = T) 
+
+sa_polygon = st_transform(sa_polygon, proj_string)
+occs_final = st_transform(occs_final, proj_string)
+range_maps = st_transform(range_maps, proj_string)
+raster_data = terra::rast(raster_filepaths) %>% 
+  terra::project(proj_string) 
+
+# ---------------------------------------------------------------------------#
+# Sample random absences                                                  ####
+#                                                                            #
+# Absences are sampled at random                                             #
+# (1) within the defined sampling area (same number as presences)            #
+# (2) outside the defined sampling area (100 records)                        #
+# ---------------------------------------------------------------------------#
+species_processed = list.files("data/r_objects/pa_sampling_random_extended/", pattern = "RData") %>% 
+  stringr::str_remove(".RData")
+
+for(species in setdiff(target_species, species_processed)){
+  print(species)
+  # Prepare species occurrence data #####
+  occs_spec = dplyr::filter(occs_final, species == !!species)
+  if(nrow(occs_spec) <= 1){
+    warning("Skipping species ", species, ": Can't process species with a single presence record." )
+  }
+  
+  occs_multipoint = occs_spec %>% 
+    summarise(geometry = st_combine(geometry))
+  
+  range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% 
+    rename(species = name_matched)
+  
+  # Define core sampling region
+  # 1. Union all points from range polygon and occurrences records
+  # 2. Build concave polygon from unioned points
+  # 3. Buffer by 50 km to extend sampling region into new environments
+  # 4. Crop by South America outline
+  # 5. Crop by buffered presence points
+  sampling_region = st_cast(range_spec, "MULTIPOINT") %>% 
+    st_union(occs_multipoint) %>% 
+    geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11
+    st_as_sf() %>% 
+    st_set_crs(proj_string) %>% 
+    st_buffer(dist = 50000) %>% 
+    st_intersection(sa_polygon) %>% 
+    st_difference(st_buffer(occs_multipoint, dist = 50000))
+  
+  # Sample balanced number of pseudo absences inside core region
+  abs_spec_list = list()
+  
+  samples_required = nrow(occs_spec)
+  while(samples_required != 0){
+    sample_points = sf::st_sample(sampling_region, size = samples_required)
+    env_sample = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE)
+    
+    sample_abs = st_sf(env_sample, geometry = sample_points) %>% 
+      drop_na() %>% 
+      dplyr::mutate(
+        record_type = "absence_core"
+      )
+    
+    abs_spec_list[[length(abs_spec_list)+1]] = sample_abs
+    
+    samples_required = samples_required - nrow(sample_abs) # Sometimes there are no env data for sample points, so keep sampling
+  }
+  
+  # Sample 100 pseudo absences outside core region
+  background_region = st_difference(sa_polygon, sampling_region)
+  samples_required = 100
+  while(samples_required != 0){
+    sample_points = sf::st_sample(background_region, size = samples_required)
+    env_sample = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE)
+    
+    sample_abs = st_sf(env_sample, geometry = sample_points) %>% 
+      drop_na() %>% 
+      dplyr::mutate(
+        record_type = "absence_background"
+      )
+    
+    abs_spec_list[[length(abs_spec_list)+1]] = sample_abs
+    
+    samples_required = samples_required - nrow(sample_abs) # Sometimes there are no env data for sample points, so keep sampling
+  }
+  
+  # Consolidate absences
+  abs_spec = bind_rows(abs_spec_list)
+  abs_spec = abs_spec %>% 
+    bind_cols(
+      st_coordinates(st_transform(abs_spec, "EPSG:4326")) 
+    ) %>% 
+    dplyr::rename(
+      longitude = X,
+      latitude = Y
+    ) %>% 
+    dplyr::mutate(
+      species = !!species,
+      present = 0
+    ) 
+  
+  # Create presence-absence dataframe
+  pa_spec = occs_spec %>% 
+    dplyr::mutate(
+      present = 1,
+      record_type = "present"
+    ) %>% 
+    bind_rows(abs_spec)
+  
+  # ref = st_bbox(sampling_region)[c(1,3,2,4)]
+  # ggplot() +
+  #   ggtitle(species) +
+  #   geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") +
+  #   geom_sf(data = range_spec, alpha = 0, color = "black") +
+  #   geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
+  #   scale_color_discrete(name = "Presence") +
+  #   theme_minimal()
+  # 
+  # try({
+  #   ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling_random", device = "pdf", scale = 2)
+  # })
+  
+  # Define cross-validation folds
+  folds = tryCatch({
+    spatial_folds = suppressMessages(
+      blockCV::cv_spatial(
+        pa_spec,
+        column = "present",
+        k = 5,
+        progress = F, plot = F, report = F
+      )
+    )
+    
+    spatial_folds$folds_ids
+  }, error = function(e){
+    NA
+  })
+  
+  pa_spec$fold_eval = folds
+  
+  save(pa_spec, file = paste0("data/r_objects/pa_sampling_random_extended/", species, ".RData"))
+}
+
+# Combine presence-absence data
+files = list.files("data/r_objects/pa_sampling_random_extended/", full.names = T)
+model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
+  bind_rows() %>% 
+  group_by(species, present) %>%
+  mutate(weight = 1/n()) %>%
+  ungroup()
+
+save(model_data, file = "data/r_objects/model_data_random_abs_extended.RData")
diff --git a/R/03_03_dataset_exploration.R b/R/03_03_dataset_exploration.R
new file mode 100644
index 0000000000000000000000000000000000000000..ad331734226f8f071007abe5e739d5fd83df1d4b
--- /dev/null
+++ b/R/03_03_dataset_exploration.R
@@ -0,0 +1,53 @@
+library(tidyverse)
+library(sf)
+library(terra)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data.RData")
+load("data/r_objects/sa_polygon.RData")
+
+sa_polygon = sf::st_transform(sa_polygon, crs(model_data))
+
+# ---------------------------------------#
+# Plot all presences in the dataset   ####
+# ---------------------------------------#
+data_extent = ext(model_data)
+
+presences = model_data %>% 
+  dplyr::filter(present == 1)
+
+template_raster <- terra::rast(data_extent, resolution = 1000)  # 1000m = 1km
+crs(template_raster) <- st_crs(model_data)$wkt
+
+point_counts <- terra::rasterize(
+  vect(presences),          
+  template_raster,           
+  field = NULL,              
+  fun = "count",             
+  background = 0             
+) %>% 
+  terra::mask(vect(sa_polygon))
+
+length(which(values(point_counts) > 0)) 
+
+# only 36858 raster cells contain at least 1 presence record 
+
+ggplot() +
+  tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) +
+  scale_fill_gradientn(
+    colors = c("black", "yellow", "darkred"),
+    values = scales::rescale(c(0, 1, max(point_counts_df$count, na.rm = TRUE))),
+    breaks = c(0, 1, max(point_counts_df$count, na.rm = TRUE)),
+    na.value = "transparent"
+  ) +
+  geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) +
+  theme_minimal() +
+  coord_sf() +
+  labs(title = "Coordinate distribution and density")
+
+ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4)
+
+# ---------------------------------------#
+# Plot all presences in the dataset   ####
+# ---------------------------------------#
diff --git a/R/04_04_modelling_msdm_embed_informed.R b/R/04_04_modelling_msdm_embed_informed.R
deleted file mode 100644
index 719f2ec11a62a24705c41e52799c2bf43f4a36bd..0000000000000000000000000000000000000000
--- a/R/04_04_modelling_msdm_embed_informed.R
+++ /dev/null
@@ -1,115 +0,0 @@
-library(dplyr)
-library(tidyr)
-library(cito)
-
-source("R/utils.R")
-
-load("data/r_objects/model_data.RData")
-load("data/r_objects/func_dist.RData")
-
-model_species = intersect(model_data$species, colnames(func_dist)) 
-
-model_data = model_data %>% 
-  dplyr::filter(
-    !is.na(fold_eval),
-    species %in% !!model_species
-  ) %>% 
-  dplyr::mutate(species = as.factor(species)) %>% 
-  sf::st_drop_geometry()
-
-# ----------------------------------------------------------------------#
-# Train model                                                        ####
-# ----------------------------------------------------------------------#
-func_dist = func_dist[model_species, model_species]
-embeddings = eigen(func_dist)$vectors[,1:10]
-predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
-formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species, weights = embeddings)")) 
-
-# 1. Cross validation
-for(fold in 1:5){
-  # Prepare data
-  train_data = dplyr::filter(model_data, fold_eval != fold)
-  
-  # Run model
-  converged = F
-  while(!converged){
-    msdm_embed_traits_fit = dnn(
-      formula,
-      data = train_data,
-      hidden = c(200L, 200L, 200L),
-      loss = "binomial",
-      epochs = 5000, 
-      lr = 0.001,   
-      batchsize = 4096,
-      dropout = 0.25,
-      burnin = 50,
-      optimizer = config_optimizer("adam"),
-      early_stopping = 200,
-      validation = 0.2,
-      device = "cuda"
-    )
-    
-    if(min(msdm_embed_traits_fit$losses$valid_l, na.rm = T) < 0.4){
-      converged = T
-    }
-  }
-  
-  save(msdm_embed_traits_fit, file = paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_fold", fold,".RData"))
-}
-
-# Full model
-msdm_embed_traits_fit = dnn(
-  formula,
-  data = model_data,
-  hidden = c(200L, 200L, 200L),
-  loss = "binomial",
-  epochs = 7500, 
-  lr = 0.001,   
-  baseloss = 1,
-  batchsize = 4096,
-  dropout = 0.25,
-  burnin = 500,
-  optimizer = config_optimizer("adam"),
-  early_stopping = 300,
-  validation = 0.2,
-  device = "cuda"
-)
-
-save(msdm_embed_traits_fit, file = paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_full.RData"))
-
-# ----------------------------------------------------------------------#
-# Evaluate model                                                     ####
-# ----------------------------------------------------------------------#
-msdm_embed_traits_performance = lapply(1:5, function(fold){
-  load(paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_fold", fold, ".RData"))
-  
-  test_data_split = model_data %>% 
-    dplyr::filter(fold_eval == fold) %>% 
-    dplyr::group_split(species)
-  
-  lapply(test_data_split, function(test_data_spec){
-    species = test_data_spec$species[1]
-    
-    performance = tryCatch({
-      evaluate_model(msdm_embed_traits_fit, test_data_spec)
-    }, error = function(e){
-      list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, 
-           Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, 
-           TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
-    })
-    
-    performance_summary = performance %>% 
-      as_tibble() %>% 
-      mutate(
-        species = !!species,
-        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
-        fold_eval = !!fold,
-        model = "MSDM_embed_traits",
-      ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
-  }) %>% 
-    bind_rows()
-}) %>% 
-  bind_rows()
-
-save(msdm_embed_traits_performance, file = paste0("data/r_objects/msdm_embed_traits_performance.RData"))
diff --git a/R/04_05_modelling_msdm_rf.R b/R/04_05_modelling_msdm_rf.R
index 5e13f48214f0b20ef85842840f3de660d49ab775..0e349b8b083717169d4717f9014d9f2c524a5ac3 100644
--- a/R/04_05_modelling_msdm_rf.R
+++ b/R/04_05_modelling_msdm_rf.R
@@ -1,10 +1,11 @@
 library(dplyr)
 library(tidyr)
-library(cito)
+library(caret)
+library(ranger)
 
 source("R/utils.R")
 
-load("data/r_objects/model_data.RData")
+load("data/r_objects/model_data_random_abs_extended.RData")
 
 model_data = model_data %>% 
   dplyr::filter(!is.na(fold_eval)) %>% 
@@ -48,10 +49,11 @@ for(fold in 1:5){
     metric = "Accuracy",
     trControl = train_ctrl,
     tuneGrid = tune_grid,
-    num.threads = 32
+    weights = data_train$weight,
+    num.threads = 48
   )
   
-  save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData"))
+  save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
 }
 
 # Full model
@@ -83,20 +85,21 @@ rf_fit = caret::train(
   tuneGrid = tune_grid
 )
 
-save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
+save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
 
 # ----------------------------------------------------------------------#
 # Evaluate model                                                     ####
 # ----------------------------------------------------------------------#
-msdm_rf_performance = lapply(1:5, function(fold){
-  load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData"))
+msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){
+  load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
   
-  test_data =dplyr::filter(model_data, fold_eval == fold) %>% 
+  test_data = model_data %>% 
+    dplyr::filter(fold_eval == fold, record_type != "absence_background") %>% 
     sf::st_drop_geometry()
   
   actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
-  probs = predict(rf_fit, test_data, type = "prob")$P
-  preds = predict(rf_fit, test_data, type = "raw")
+  probs = predict_new(rf_fit, test_data, type = "prob")
+  preds = predict_new(rf_fit, test_data, type = "class")
   
   eval_dfs = data.frame(
     species = test_data$species,
@@ -139,12 +142,13 @@ msdm_rf_performance = lapply(1:5, function(fold){
         species = !!species,
         obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
         fold_eval = !!fold,
-        model = "MSDM_rf",
+        model = "MSDM_rf_random_abs_extended",
       ) %>% 
-      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") %>% 
+      drop_na()
   }) %>% 
     bind_rows()
 }) %>% 
   bind_rows()
 
-save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData"))
+save(msdm_rf_random_abs_extended_performance, file = paste0("data/r_objects/msdm_rf_random_abs_extended_performance.RData"))
diff --git a/R/04_06_rf_testing.R b/R/04_06_rf_testing.R
new file mode 100644
index 0000000000000000000000000000000000000000..9057285d94f4516437072457058308eb43a6eca2
--- /dev/null
+++ b/R/04_06_rf_testing.R
@@ -0,0 +1,144 @@
+library(dplyr)
+library(tidyr)
+library(caret)
+library(ranger)
+
+source("R/utils.R")
+
+load("data/r_objects/model_data_random_abs.RData")
+
+model_data = model_data %>% 
+  dplyr::filter(!is.na(fold_eval)) %>% 
+  dplyr::mutate(
+    species = as.factor(species),
+    present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
+  ) 
+
+predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species") 
+
+full_data = model_data %>% 
+  sf::st_drop_geometry()
+
+train_ctrl = caret::trainControl(
+  search = "random",
+  classProbs = TRUE, 
+  summaryFunction = caret::twoClassSummary, 
+  savePredictions = "final"
+)
+
+# Run model
+rf_fit = caret::train(
+  x = full_data[, predictors],
+  y = full_data$present_fct,
+  method = "ranger",
+  metric = "Accuracy",
+  trControl = train_ctrl,
+  tuneLength = 1,
+  num.threads = 48,
+  importance = 'impurity',
+  verbose = T
+)
+
+save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
+
+varimp = varImp(rf_fit)
+
+# Cross validation
+for(fold in 1:5){
+  ## Preparations #####
+  data_train = dplyr::filter(model_data, fold_eval != fold) %>% 
+    sf::st_drop_geometry()
+  
+  # Define caret training routine 
+  train_ctrl = caret::trainControl(
+    method = "cv",
+    number = 5,
+    classProbs = TRUE, 
+    summaryFunction = caret::twoClassSummary, 
+    savePredictions = "final"
+  )
+  
+  tune_grid = expand.grid(
+    mtry = c(2,4,6,8),
+    splitrule = "gini",
+    min.node.size = c(1,4,9,16)
+  )
+  
+  # Run model
+  rf_fit = caret::train(
+    x = data_train[, predictors],
+    y = data_train$present_fct,
+    method = "ranger",
+    metric = "Accuracy",
+    trControl = train_ctrl,
+    tuneGrid = tune_grid,
+    num.threads = 48,
+    verbose = F
+  )
+  
+  save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
+}
+
+# ----------------------------------------------------------------------#
+# Evaluate model                                                     ####
+# ----------------------------------------------------------------------#
+msdm_rf_random_abs_performance = lapply(1:5, function(fold){
+  load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
+  
+  test_data = dplyr::filter(model_data, fold_eval == fold) %>% 
+    sf::st_drop_geometry()
+  
+  actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
+  probs = predict(rf_fit, test_data, type = "prob")$P
+  preds = predict(rf_fit, test_data, type = "raw")
+  
+  eval_dfs = data.frame(
+    species = test_data$species,
+    actual,
+    probs,
+    preds
+  ) %>% 
+    group_by(species) %>% 
+    group_split()
+  
+  
+  lapply(eval_dfs, function(eval_df_spec){
+    species = eval_df_spec$species[1]
+    
+    performance = tryCatch({
+      auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs, levels = c("P", "A"), direction = ">")$auc
+      cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")
+      
+      list(
+        AUC = as.numeric(auc),
+        Accuracy = cm$overall["Accuracy"],
+        Kappa = cm$overall["Kappa"],
+        Precision = cm$byClass["Precision"],
+        Recall = cm$byClass["Recall"],
+        F1 = cm$byClass["F1"],
+        TP = cm$table["P", "P"],
+        FP = cm$table["P", "A"],
+        TN = cm$table["A", "A"],
+        FN = cm$table["A", "P"]
+      )
+    }, error = function(e){
+      list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, 
+           Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, 
+           TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
+    })
+    
+    performance_summary = performance %>% 
+      as_tibble() %>% 
+      mutate(
+        species = !!species,
+        obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
+        fold_eval = !!fold,
+        model = "MSDM_rf_random_abs",
+      ) %>% 
+      tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
+  }) %>% 
+    bind_rows()
+}) %>% 
+  bind_rows()
+
+save(msdm_rf_random_abs_performance, file = paste0("data/r_objects/msdm_rf_random_abs_performance.RData"))
diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd
index 686a903180ac4f33b3988fc3740db968059b9c65..1c1b64d5e8fa93cc5641e9f324bb58b5002f053e 100644
--- a/R/05_01_performance_report.qmd
+++ b/R/05_01_performance_report.qmd
@@ -12,10 +12,11 @@ library(plotly)
 library(DT)
 
 load("../data/r_objects/model_data.RData")
-load("../data/r_objects/ssdm_results.RData")
 load("../data/r_objects/msdm_embed_performance.RData")
-load("../data/r_objects/msdm_onehot_performance.RData")
 load("../data/r_objects/msdm_rf_performance.RData")
+load("../data/r_objects/msdm_rf_random_abs_performance.RData")
+load("../data/r_objects/msdm_rf_no_species_performance.RData")
+load("../data/r_objects/msdm_rf_no_species_random_abs_performance.RData")
 
 load("../data/r_objects/range_maps.RData")
 load("../data/r_objects/range_maps_gridded.RData")
@@ -54,10 +55,10 @@ lin_regression = function(x, y, family = "binomial"){
 }
 
 # Performance table
-performance = ssdm_results %>% 
-  bind_rows(msdm_embed_performance) %>% 
-  bind_rows(msdm_onehot_performance) %>% 
-  bind_rows(msdm_rf_performance) %>% 
+performance = msdm_rf_performance %>% 
+  bind_rows(msdm_rf_random_abs_performance) %>% 
+  bind_rows(msdm_rf_no_species_performance) %>% 
+  bind_rows(msdm_rf_no_species_random_abs_performance) %>% 
   dplyr::filter(fold_eval <= 3) %>% # TODO use all folds
   dplyr::group_by(species, model, metric) %>% 
   dplyr::summarise(value = mean(value, na.rm = F)) %>% 
@@ -174,8 +175,8 @@ Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).
 
 ### Key findings:
 
-- MSDM algorithms score much higher across all performance algorithms
-- Among MSDM algorithms, RF outperforms NNs significantly
+-   MSDM algorithms score much higher across all performance algorithms
+-   Among MSDM algorithms, RF outperforms NNs significantly
 
 ## Analysis
 
@@ -364,7 +365,6 @@ bslib::card(plot, full_screen = T)
 ```
 :::
 
-
 ### Range coverage bias
 
 Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
@@ -427,4 +427,4 @@ bslib::card(plot, full_screen = T)
 plot = plot_performance(df_plot, metric = "recall", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
 bslib::card(plot, full_screen = T)
 ```
-:::
\ No newline at end of file
+:::
diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R
index f1b9d262008a17cde8ad3251f1b54d31c6b3f037..9e2d074e37342fadb7268e670ef90d671ef68f71 100644
--- a/R/05_02_publication_analysis.R
+++ b/R/05_02_publication_analysis.R
@@ -13,7 +13,7 @@ library(cito)
 
 source("R/utils.R")
 
-load("data/r_objects/model_data.RData")
+load("data/r_objects/model_data_random_abs_extended.RData")
 load("data/r_objects/ssdm_results.RData")
 load("data/r_objects/msdm_embed_performance.RData")
 load("data/r_objects/msdm_onehot_performance.RData")
@@ -154,7 +154,7 @@ library(cito)
 library(ranger)
 
 # Define plotting function
-plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "nn", "msdm_embed", "msdm_onehot", "msdm_rf")){
+plot_predictions = function(spec, model_data, raster_data, algorithms){
   # Species data
   load("data/r_objects/range_maps.RData")
   
@@ -165,14 +165,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam",
     sf::st_transform(sf::st_crs(pa_spec))
   
   # Extract raster values into df
-  bbox_spec = sf::st_bbox(pa_spec) %>% 
-    expand_bbox(expansion = 0.5)
+  bbox_spec = sf::st_bbox(range_spec) %>% 
+    expand_bbox(expansion = 0.25)
   
   raster_crop = terra::crop(raster_data, bbox_spec)
+  pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),]
+  
   new_data = raster_crop %>% 
     terra::values(dataframe = T, na.rm = T) %>% 
     dplyr::mutate(species = unique(pa_spec$species)) 
   
+  plots = list()
   # Make predictions
   for(algorithm in algorithms){
     message(algorithm)
@@ -187,7 +190,16 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam",
       predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P"))
     } else if(algorithm == "msdm_rf"){
       load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
-      predictions = predict(rf_fit, new_data, type = "raw")
+      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
+    } else if(algorithm == "msdm_rf_random_abs"){
+      load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
+      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
+    } else if(algorithm == "msdm_rf_fit_no_species"){
+      load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_full.RData")
+      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
+    } else if(algorithm == "msdm_rf_fit_no_species_random_abs"){
+      load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_random_abs_full.RData")
+      predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
     } else if(algorithm == "nn") {
       load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData"))
       new_data_tmp = dplyr::select(new_data, -species)
@@ -200,17 +212,26 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam",
     }
     
     raster_pred = terra::rast(raster_crop, nlyrs = 1)
-    values(raster_pred)[as.integer(rownames(new_data))] <- predictions
+    values(raster_pred)[as.integer(rownames(new_data))] <- as.integer(predictions) - 1
     
     # Plot
-    plot(raster_pred, 
-         type = "classes",
-         levels = c("Absent", "Present"),
-         main = paste0("Range prediction (", toupper(algorithm), "): ", spec))
-    point_colors = sapply(pa_spec$present, function(x) ifelse(x == 0, "#000000AA", "#FFFFFFAA"))
-    plot(pa_spec[,"present"], col = point_colors, add = T, pch = 16, cex = 0.7)
-    plot(range_spec, border = "red", lwd = 1.5, col = NA, add = T)
+    p = ggplot() +
+      tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) +
+      scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) +
+      geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FFFFFF99") +
+      geom_sf(data = range_spec, col = "red", fill = NA) +
+      scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") +
+      theme_minimal() +
+      coord_sf() +
+      labs(title = paste0("Predictions vs Observations (", algorithm, "): ", spec))  +
+      guides(shape = guide_legend(
+        override.aes = list(color = "black")  # This makes shapes black in the legend only
+      ))
+    
+    plots[[algorithm]] = p
   }
+  
+  return(plots)
 }
 
 # Load raster
@@ -221,14 +242,12 @@ raster_data = terra::rast(raster_filepaths) %>%
   terra::crop(sf::st_bbox(sa_polygon)) %>% 
   terra::project(sf::st_crs(model_data)$input)
 
-specs = sort(sample(levels(model_data$species), 30))
+specs = sort(sample(levels(model_data$species), 4))
 for(spec in specs){
-  pdf(file = paste0("plots/range_predictions/", spec, " (msdm).pdf"))
-  tryCatch({
-    plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_embed", "msdm_onehot", "msdm_rf"))
-  }, finally = {
-    dev.off()
-  })
+  pdf(file = paste0("plots/range_predictions/", spec, " (msdm_rf).pdf"), width = 12)
+  plots = plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_rf", "msdm_rf_random_abs", "msdm_rf_fit_no_species", "msdm_rf_fit_no_species_random_abs"))
+  lapply(plots, print)
+  dev.off()
 }
 
 # ------------------------------------------------------------------ #
@@ -237,6 +256,7 @@ for(spec in specs){
 load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData")
 load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData")
 load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
+load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
 
 compare_species_predictions = function(model, sample_size){
   specs = sample(unique(model_data$species), 2, replace = F)
@@ -265,7 +285,7 @@ compare_species_predictions(msdm_onehot_fit, 500)
 compare_species_predictions(rf_fit, 500) 
 
 ## --> Predictions for different species are weakly/moderately correlated in NN models (makes sense)
-## --> Predictions for different species are always highly correlated in RF model (seems problematioc)
+## --> Predictions for different species are always highly correlated in RF model (seems problematic)
 
 # ------------------------------------------------------------------ #
 # 5. Compare predictions across models                            ####
diff --git a/R/_publish.yml b/R/_publish.yml
index 91c7c692506ebfc3ec58deb69d220df70c4025d8..c18ea3e7bdbaf8fc276bc95e87377c6f205af5a4 100644
--- a/R/_publish.yml
+++ b/R/_publish.yml
@@ -1,16 +1,4 @@
-- source: 05_performance_analysis.qmd
-  quarto-pub:
-    - id: 309c45c3-1515-4985-a435-9ffa1888c5e7
-      url: 'https://chrkoenig.quarto.pub/ssdm-performance-analysis-0a06'
-- source: 05_01_performance_analysis_carsten.qmd
-  quarto-pub:
-    - id: f9cafa63-bfd4-4c00-b401-09ec137bd3ce
-      url: 'https://chrkoenig.quarto.pub/sdm-performance-carsten'
-- source: 05_01_performance_analysis.qmd
-  quarto-pub:
-    - id: cfaa8a4d-31e3-410c-ba1a-399aad5582fd
-      url: 'https://chrkoenig.quarto.pub/sdm-performance-analysis-8272'
 - source: 05_01_performance_report.qmd
   quarto-pub:
-    - id: 25ccfe7b-b9d4-4bc9-bbf8-823676aab7bd
-      url: 'https://chrkoenig.quarto.pub/sdm-performance-report-f08b'
+    - id: 77b30762-b473-447d-be18-d4bbd6261fbf
+      url: 'https://chrkoenig.quarto.pub/sdm-performance-report'
diff --git a/R/utils.R b/R/utils.R
index 1f9ec189484331ae200786eae0515028d9a1452b..d00b91a6355e24ee441a8baa1a22f5e55bbde698 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -42,18 +42,11 @@ predict_new = function(model, data, type = "prob"){
       return(probs)
     } else {
       preds = predict(model, data, type = "raw")
+      return(preds)
     }
   }
 }
 
-if(class(model) %in% c("citodnn", "citodnnBootstrap")){
-  p1 = predict(model, df1, type = "response")[,1]
-  p2 = predict(model, df2, type = "response")[,1]
-} else {
-  p1 = predict(model, df1, type = "prob")$P
-  p2 = predict(model, df2, type = "prob")$P
-}
-
 evaluate_model <- function(model, data) {
   # Accuracy: The proportion of correctly predicted instances (both true positives and true negatives) out of the total instances.
   # Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN)
diff --git a/env_vars.png b/env_vars.png
new file mode 100644
index 0000000000000000000000000000000000000000..273e21027964862a1deec247aafe6ca9eaea63f4
Binary files /dev/null and b/env_vars.png differ