From c33c0424fc345f738efc284ba5b47dfde9643702 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Wed, 18 Dec 2024 22:27:33 +0100
Subject: [PATCH 1/7] protoyped kde based absence sampling

---
 R/03_01_presence_preparation.R      |  68 +++++++++++
 R/03_02_absence_preparation.R       | 174 ++++++++++++++++++++++++++++
 R/03_presence_absence_preparation.R | 160 -------------------------
 3 files changed, 242 insertions(+), 160 deletions(-)
 create mode 100644 R/03_01_presence_preparation.R
 create mode 100644 R/03_02_absence_preparation.R
 delete mode 100644 R/03_presence_absence_preparation.R

diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R
new file mode 100644
index 0000000..c8705b9
--- /dev/null
+++ b/R/03_01_presence_preparation.R
@@ -0,0 +1,68 @@
+# General packages
+library(dplyr)
+library(tidyr)
+library(furrr)
+
+# Geo packages
+library(terra)
+library(CoordinateCleaner)
+library(sf)
+
+# DB packages
+library(Symobio)
+library(DBI)
+
+source("R/utils.R")
+
+con = db_connect() # Connection to Symobio
+sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
+
+# ---------------------------------------------------------------------------#
+# Prepare Geodata                                                         ####
+# ---------------------------------------------------------------------------#
+# TODO: finalize predictor choice
+raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% 
+  stringr::str_sort(numeric = T)
+
+sa_polygon = rnaturalearth::ne_countries() %>% 
+  dplyr::filter(continent == "South America") %>% 
+  sf::st_union()
+
+# ---------------------------------------------------------------------------#
+# Prepare Occurrence Data                                                 ####
+# ---------------------------------------------------------------------------#
+load("data/r_objects/range_maps.RData")
+target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
+
+# Query species from Symobio
+occs = tbl(con, "species_occurrences") %>% 
+  dplyr::filter(species %in% target_species) %>% 
+  dplyr::select(-year) %>% 
+  dplyr::distinct() %>% 
+  collect() %>% 
+  sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>% 
+  sf::st_filter(sa_polygon)
+
+# Flag occurrences using Coordinate cleaner
+occs_flagged = occs %>% 
+  dplyr::distinct(species, coordinate_id, longitude, latitude) %>% 
+  group_by(species) %>% 
+  group_split() %>% 
+  purrr::map(                     # Loop over species individually due to bug in CoordinateCleaner
+    CoordinateCleaner::clean_coordinates,
+    lon = "longitude", 
+    lat = "latitude",
+    tests = c("centroids", "gbif", "equal", "institutions", "outliers"),
+    outliers_method = "quantile",
+    verbose = F
+  ) %>% 
+  bind_rows() %>% 
+  dplyr::filter(.summary == T) %>% 
+  dplyr::select(species, coordinate_id, longitude, latitude)
+
+# Finalize and save occurrence dataset
+occs_final = occs %>% 
+  inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% 
+  dplyr::select(-coordinate_id)
+
+save(occs_final, file = "data/r_objects/occs_final.RData")
\ No newline at end of file
diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
new file mode 100644
index 0000000..b72d132
--- /dev/null
+++ b/R/03_02_absence_preparation.R
@@ -0,0 +1,174 @@
+# General packages
+library(dplyr)
+library(tidyr)
+library(furrr)
+
+# Geo packages
+library(terra)
+library(CoordinateCleaner)
+library(sf)
+library(MASS)
+library(geos)
+
+# DB packages
+library(Symobio)
+library(DBI)
+
+source("R/utils.R")
+
+load("data/r_objects/range_maps.RData")
+load("data/r_objects/occs_final.RData")
+
+# ---------------------------------------------------------------------------#
+# Prepare Geodata                                                         ####
+# ---------------------------------------------------------------------------#
+target_species = unique(occs_final$species)
+
+sa_polygon = rnaturalearth::ne_countries() %>% 
+  dplyr::filter(continent == "South America") %>% 
+  sf::st_union() 
+
+raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
+  stringr::str_sort(numeric = T) 
+
+raster_data = terra::rast(raster_filepaths) %>% 
+  terra::crop(sf::st_bbox(sa_polygon))
+
+# Project (proj string generated with SA bbox coordinates on https://projectionwizard.org)
+proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
+
+raster_data = terra::project(raster_data, proj_string)
+sa_polygon = st_transform(sa_polygon, proj_string)
+occs_final = st_transform(occs_final, proj_string)
+range_maps = st_transform(range_maps, proj_string)
+
+# ---------------------------------------------------------------------------#
+# 1. Distance-based sampling                                              ####
+#                                                                            #
+# Absences should be predominantly sampled from regions that are             #
+# geographically close (reproduce sampling biases) but environmentally       #
+# dissimilar (avoid false negatives) to the known occurrences                #
+# ---------------------------------------------------------------------------#
+# Geographic distance
+future::plan("multisession", workers = 16)
+pseudo_absences = furrr::future_map(
+  .x = target_species, 
+  .options = furrr::furrr_options(seed = 42), 
+  .f = function(species){
+    # Prepare occurrences
+    occs_spec = dplyr::filter(occs_final, species == !!species)
+    
+    occs_poly = occs_spec %>% 
+      summarise(geometry = st_combine(geometry)) %>%
+      st_concave_hull(ratio = 0.5)
+    
+    occs_bff = occs_spec %>% 
+      st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling
+      st_union()
+    
+    range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% 
+      rename(species = name_matched)
+    
+    # Define model/sampling region
+    sampling_region = st_union(range_spec, occs_poly) %>% # Union occs poly and expert range
+      st_buffer(dist = 100000) %>% # Buffer by 100 km to extend sampling region into new environments
+      st_intersection(sa_polygon) %>% # Crop to South America 
+      st_difference(occs_bff) # Crop out buffered presences
+    
+    # Define KDE
+    ref = st_bbox(sampling_region)[c(1,3,2,4)]
+    min_extent = min(ref[2]-ref[1], ref[4]-ref[3])
+    occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
+      crop(sampling_region, mask = T, touches = F)
+    
+    # Sampling of pseudo absence
+    abs_spec = st_sf(geometry = st_sfc(), crs = proj_string)
+    while(nrow(abs_spec) < nrow(occs_spec)){
+      sample_points_raw = st_sample(sampling_region, nrow(occs_spec))
+      p = extract(occs_kde, vect(sample_points_raw))$z
+      x = runif(length(p))
+      
+      sample_points = sample_points_raw[x<p]
+      samples_required = nrow(occs_spec) - nrow(abs_spec)
+      
+      if(length(sample_points) > samples_required){
+        sample_points = sample(sample_points, samples_required)
+      }
+      
+      abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) 
+    }
+    
+    plot(sa_polygon)
+    plot(occs_kde, add = T, legend = F)
+    plot(range_spec, add = T, col = "#00000000")
+    plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5)
+    plot(occs_spec, add = T, col = "white", pch = 3, cex = 0.5)
+    
+    
+    sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% 
+      dplyr::mutate(
+        species = occs_spec$species[1],
+        longitude = sf::st_coordinates(.)[,1],
+        latitude = sf::st_coordinates(.)[,2]
+      ) %>% 
+      dplyr::rename(geometry = "x")
+    
+    abs_spec = terra::rast(raster_filepaths) %>% 
+      setNames(paste0("layer_", 1:19)) %>% 
+      terra::extract(sample_points) %>% 
+      dplyr::select(-ID) %>% 
+      dplyr::mutate(
+        present = 0,
+        geometry = sample_points$x
+      ) %>% 
+      tibble() %>% 
+      bind_cols(sample_points)
+    
+    # Create presence-absence dataframe
+    pa_spec = occs_spec %>% 
+      dplyr::mutate(present = 1) %>% 
+      bind_rows(abs_spec) 
+    
+    # Split into train and test datasets
+    train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
+    pa_spec$train = 0
+    pa_spec$train[train_index] = 1
+    
+    # Define cross-validation folds
+    folds = tryCatch({
+      spatial_folds = suppressMessages(
+        blockCV::cv_spatial(
+          pa_spec,
+          column = "present",
+          k = 5,
+          progress = F, plot = F, report = F
+        )
+      )
+      
+      spatial_folds$folds_ids
+    }, warning = function(w){
+      NA
+    }, error = function(e){
+      NA
+    })
+    
+    pa_spec$folds = folds
+    pa_spec$geometry = NULL
+    
+    return(pa_spec)
+  })
+
+model_data = bind_rows(model_data)
+save(model_data, file = "data/r_objects/model_data.RData")
+
+
+
+
+ggplot() +
+  geom_sf(data = sa_polygon, fill = "red") +
+  geom_sf(data = range_spec, fill = "green") + 
+  geom_sf(data = occs_spec) +
+  #   facet_wrap("species", nrow = 2) + 
+  #   theme_minimal()
+  #   
+  
\ No newline at end of file
diff --git a/R/03_presence_absence_preparation.R b/R/03_presence_absence_preparation.R
deleted file mode 100644
index 9fd817c..0000000
--- a/R/03_presence_absence_preparation.R
+++ /dev/null
@@ -1,160 +0,0 @@
-# General packages
-library(dplyr)
-library(tidyr)
-library(furrr)
-
-# Geo packages
-library(terra)
-library(CoordinateCleaner)
-library(sf)
-
-# DB packages
-library(Symobio)
-library(DBI)
-
-source("R/utils.R")
-
-con = db_connect() # Connection to Symobio
-sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
-
-# ---------------------------------------------------------------------------#
-# Prepare Geodata                                                         ####
-# ---------------------------------------------------------------------------#
-raster_info = tbl(con, "datasets") %>% 
-  dplyr::filter(stringr::str_detect(name, "CHELSA")) %>% 
-  collect()
-
-# raster_filepaths = list.files(raster_info$raw_data_path, pattern = ".tif$", full.names = T) %>% 
-#   stringr::str_sort(numeric = T)
-
-raster_filepaths = list.files("I:/mas_data/00_data/processed/CHELSA_v2-1_bioclim", pattern = ".tif$", full.names = T) %>% 
-  stringr::str_sort(numeric = T)
-
-sa_polygon = rnaturalearth::ne_countries() %>% 
-  dplyr::filter(continent == "South America") %>% 
-  sf::st_union()
-
-# ---------------------------------------------------------------------------#
-# Prepare Occurrence Data                                                 ####
-# ---------------------------------------------------------------------------#
-load("data/r_objects/range_maps.RData")
-target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
-
-occs = tbl(con, "species_occurrences") %>% 
-  dplyr::filter(species %in% target_species) %>% 
-  dplyr::select(-year) %>% 
-  dplyr::distinct() %>% 
-  collect() %>% 
-  sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>% 
-  sf::st_filter(sa_polygon)
-
-occs_flagged = occs %>% 
-  dplyr::distinct(species, coordinate_id, longitude, latitude) %>% 
-  group_by(species) %>% 
-  group_split() %>% 
-  purrr::map(                     # Loop over species individually due to bug in CoordinateCleaner
-    CoordinateCleaner::clean_coordinates,
-    lon = "longitude", 
-    lat = "latitude",
-    tests = c("centroids", "gbif", "equal", "institutions", "outliers"),
-    outliers_method = "quantile",
-    verbose = F
-  ) %>% 
-  bind_rows() %>% 
-  dplyr::filter(.summary == T) %>% 
-  dplyr::select(species, coordinate_id, longitude, latitude)
-
-env_vars = tbl(con, "raster_extracts_num") %>% 
-  dplyr::filter(
-    coordinate_id %in% occs$coordinate_id,
-    metric == "mean"
-  ) %>% 
-  arrange(raster_layer_id) %>% 
-  tidyr::pivot_wider(id_cols = coordinate_id, names_from = raster_layer_id, names_prefix = "layer_", values_from = value) %>% 
-  collect()
-
-occs_final = occs %>% 
-  inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% 
-  inner_join(env_vars, by = "coordinate_id") %>% 
-  dplyr::select(-coordinate_id)
-
-save(occs_final, file = "data/r_objects/occs_final.RData")
-
-# ---------------------------------------------------------------------------#
-# Create Modeling dataset                                                     ####
-# ---------------------------------------------------------------------------#
-load("data/r_objects/occs_final.RData")
-load("data/r_objects/sa_polygon.RData")
-sf::sf_use_s2(use_s2 = FALSE)
-
-occs_split = split(occs_final, occs_final$species)
-
-future::plan("multisession", workers = 16)
-model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){
-  # Define model/sampling region
-  occs_bbox = occs_spec %>% 
-    sf::st_bbox() %>% 
-    expand_bbox(min_span = 1, expansion = 0.25) %>% 
-    sf::st_as_sfc() %>% 
-    st_set_crs(st_crs(occs_spec)) 
-  
-  sample_region = suppressMessages(
-    st_as_sf(st_intersection(occs_bbox, sa_polygon))
-  )
-  
-  # Sample pseudo absence
-  sample_points = st_as_sf(st_sample(sample_region, nrow(occs_spec))) %>% 
-    dplyr::mutate(
-      species = occs_spec$species[1],
-      longitude = sf::st_coordinates(.)[,1],
-      latitude = sf::st_coordinates(.)[,2]
-    ) %>% 
-    dplyr::rename(geometry = "x")
-  
-  abs_spec = terra::rast(raster_filepaths) %>% 
-    setNames(paste0("layer_", 1:19)) %>% 
-    terra::extract(sample_points) %>% 
-    dplyr::select(-ID) %>% 
-    dplyr::mutate(
-      present = 0,
-      geometry = sample_points$x
-    ) %>% 
-    tibble() %>% 
-    bind_cols(sample_points)
-  
-  # Create presence-absence dataframe
-  pa_spec = occs_spec %>% 
-    dplyr::mutate(present = 1) %>% 
-    bind_rows(abs_spec) 
-  
-  # Split into train and test datasets
-  train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
-  pa_spec$train = 0
-  pa_spec$train[train_index] = 1
-  
-  # Define cross-validation folds
-  folds = tryCatch({
-    spatial_folds = suppressMessages(
-      blockCV::cv_spatial(
-        pa_spec,
-        column = "present",
-        k = 5,
-        progress = F, plot = F, report = F
-      )
-    )
-    
-    spatial_folds$folds_ids
-  }, warning = function(w){
-    NA
-  }, error = function(e){
-    NA
-  })
-  
-  pa_spec$folds = folds
-  pa_spec$geometry = NULL
-  
-  return(pa_spec)
-})
-
-model_data = bind_rows(model_data)
-save(model_data, file = "data/r_objects/model_data.RData")
-- 
GitLab


From e32d220e9b11a5e4d6d6b69d29ef15b9695bfd9f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Thu, 19 Dec 2024 14:41:28 +0100
Subject: [PATCH 2/7] work on reverse niche

---
 .gitignore                    |  1 +
 R/03_02_absence_preparation.R | 58 +++++++++++++++++++++++++++--------
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/.gitignore b/.gitignore
index ac6cec0..3acf079 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,5 +11,6 @@ renv/cache/
 
 # Data files
 data/
+plots/
 R/*/
 R/*.html
\ No newline at end of file
diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index b72d132..5a34407 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -7,9 +7,13 @@ library(furrr)
 library(terra)
 library(CoordinateCleaner)
 library(sf)
-library(MASS)
 library(geos)
 
+# Stat packages
+library(MASS)
+library(dismo)
+library(spatialEco)
+
 # DB packages
 library(Symobio)
 library(DBI)
@@ -49,18 +53,16 @@ range_maps = st_transform(range_maps, proj_string)
 # geographically close (reproduce sampling biases) but environmentally       #
 # dissimilar (avoid false negatives) to the known occurrences                #
 # ---------------------------------------------------------------------------#
-# Geographic distance
 future::plan("multisession", workers = 16)
 pseudo_absences = furrr::future_map(
   .x = target_species, 
   .options = furrr::furrr_options(seed = 42), 
   .f = function(species){
-    # Prepare occurrences
+    # Prepare species occurrence data #####
     occs_spec = dplyr::filter(occs_final, species == !!species)
     
-    occs_poly = occs_spec %>% 
-      summarise(geometry = st_combine(geometry)) %>%
-      st_concave_hull(ratio = 0.5)
+    occs_combined = occs_spec %>% 
+      summarise(geometry = st_combine(geometry))
     
     occs_bff = occs_spec %>% 
       st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling
@@ -70,17 +72,47 @@ pseudo_absences = furrr::future_map(
       rename(species = name_matched)
     
     # Define model/sampling region
-    sampling_region = st_union(range_spec, occs_poly) %>% # Union occs poly and expert range
-      st_buffer(dist = 100000) %>% # Buffer by 100 km to extend sampling region into new environments
-      st_intersection(sa_polygon) %>% # Crop to South America 
-      st_difference(occs_bff) # Crop out buffered presences
+    # 1. Union all point coordinates from range polygon and occurrences records
+    # 2. Build concave polygon from unionen points
+    # 3. Buffer by 100 km to extend sampling region into new environments
+    # 4. Crop by South America outline
+    sampling_region = st_cast(range_spec, "MULTIPOINT") %>% 
+      st_union(occs_combined) %>% 
+      geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11
+      st_as_sf() %>% 
+      st_set_crs(proj_string) %>% 
+      st_buffer(dist = 100000) %>% 
+      st_intersection(sa_polygon) 
     
-    # Define KDE
+    # Define spatial KDE
     ref = st_bbox(sampling_region)[c(1,3,2,4)]
     min_extent = min(ref[2]-ref[1], ref[4]-ref[3])
     occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
       crop(sampling_region, mask = T, touches = F)
     
+    # Environmental KDE
+    env_sampling_region = raster::raster(terra::crop(raster_data, sampling_region, mask = T))
+    maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec))
+
+# Step 3: Predict Across the Raster
+# Predict environmental similarity (continuous output)
+env_pred <- predict(maxent_model, env_sampling_region)
+    
+    
+    svm_model <- e1071::svm(x = env_spec,
+                            nu = 0.01,
+                            gamma = 0.00005,
+                            type = 'one-classification',
+                            kernel = "radial",
+                            scale = FALSE)
+    
+    env_pred <- terra::predict(env_sampling_region, fun = function(x) {
+      dnorm(x, mean = gmm_model$parameters$mean, sd = gmm_model$parameters$variance$sigma)
+    })
+    
+    env_pred = predict(env_sampling_region, svm_model, na.rm = T)
+    plot(env_pred)
+    
     # Sampling of pseudo absence
     abs_spec = st_sf(geometry = st_sfc(), crs = proj_string)
     while(nrow(abs_spec) < nrow(occs_spec)){
@@ -99,10 +131,10 @@ pseudo_absences = furrr::future_map(
     }
     
     plot(sa_polygon)
-    plot(occs_kde, add = T, legend = F)
+    #plot(occs_kde, add = T, legend = F)
     plot(range_spec, add = T, col = "#00000000")
+    plot(occs_spec, add = T, col = "red", pch = 3, cex = 0.5)
     plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5)
-    plot(occs_spec, add = T, col = "white", pch = 3, cex = 0.5)
     
     
     sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% 
-- 
GitLab


From f1914575bd20253a094c33854a6c89fc4102a4d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Fri, 3 Jan 2025 15:49:20 +0100
Subject: [PATCH 3/7] environmental variable selection

---
 ...ration.R => 01_01_range_map_preparation.R} |   0
 R/01_02_raster_preparation.R                  |  55 ++++
 renv.lock                                     | 295 +++++++++++++++++-
 3 files changed, 348 insertions(+), 2 deletions(-)
 rename R/{01_range_map_preparation.R => 01_01_range_map_preparation.R} (100%)
 create mode 100644 R/01_02_raster_preparation.R

diff --git a/R/01_range_map_preparation.R b/R/01_01_range_map_preparation.R
similarity index 100%
rename from R/01_range_map_preparation.R
rename to R/01_01_range_map_preparation.R
diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R
new file mode 100644
index 0000000..fae113f
--- /dev/null
+++ b/R/01_02_raster_preparation.R
@@ -0,0 +1,55 @@
+library(tidyverse)
+library(terra)
+library(rnaturalearth)
+library(furrr)
+
+# Define Study extent ####
+study_extent = rnaturalearth::ne_countries() %>% 
+  dplyr::filter(continent == "South America") %>% 
+  sf::st_union() %>% 
+  sf::st_bbox() %>% 
+  ext()
+
+# CHELSA bioclim target variables ####
+chelsa_urls = c("https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_cmi_mean_1981-2010_V.2.1.tif",
+                "https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_rsds_1981-2010_mean_V.2.1.tif",
+                "https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio6_1981-2010_V.2.1.tif",
+                "https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio17_1981-2010_V.2.1.tif")
+
+chelsa = terra::rast(chelsa_urls) %>% 
+  crop(study_extent)
+set.names(chelsa, c("cmi", "rsds", "bio6", "bio17"))
+terra::writeRaster(chelsa, filename = file.path("data", "geospatial", "raster", basename(chelsa_urls)), overwrite = T)
+
+# Tree canopy cover ####
+igfc = list.files("I:/mas_data/00_data/processed/iGFC/", pattern = "canopyDensity_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% 
+  terra::rast() %>% 
+  terra::resample(chelsa) %>%   
+  app(mean, na.rm = TRUE)
+terra::set.names(igfc, "igfc")
+terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "mean_canopyDensity_1992-2010.tif"), overwrite = T)
+
+# Seasonal inundation ####
+igsw = list.files("I:/mas_data/00_data/processed/iGSW/", pattern = "seasonal_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% 
+  terra::rast() %>% 
+  terra::resample(chelsa) %>%   
+  app(mean, na.rm = TRUE)
+terra::set.names(igsw, "igsw")
+terra::writeRaster(igsw, filename = file.path("data", "geospatial", "raster", "mean_seasonalInundation_1992-2010.tif"), overwrite = T)
+
+# Distance to freshwater ####
+dtfw = list.files("I:/mas_data/00_data/processed/linearDistance/", pattern = "lake|reservoirs|river", full.names = TRUE) %>% 
+  terra::rast() %>% 
+  crop(study_extent) %>% 
+  min(na.rm = T)  %>% 
+  terra::resample(chelsa)
+terra::set.names(dtfw, "dtfw")
+terra::writeRaster(dtfw, filename = file.path("data", "geospatial", "raster", "mean_distanceToWater.tif"), overwrite = T)
+
+# Terrain ####
+roughness = list.files("C:/Users/ye87zine/Downloads", pattern = "roughness", full.names = T) %>% 
+  terra::rast() %>% 
+  crop(study_extent) %>% 
+  terra::resample(chelsa)
+terra::set.names(roughness, "roughness")
+terra::writeRaster(roughness, filename = file.path("data", "geospatial", "raster", "terrain_roughness.tif"), overwrite = T)
diff --git a/renv.lock b/renv.lock
index 1a9a7b4..ff0fc5a 100644
--- a/renv.lock
+++ b/renv.lock
@@ -8,7 +8,29 @@
       }
     ]
   },
+  "Bioconductor": {
+    "Version": "3.18"
+  },
   "Packages": {
+    "BiocManager": {
+      "Package": "BiocManager",
+      "Version": "1.30.25",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "utils"
+      ],
+      "Hash": "3aec5928ca10897d7a0a1205aae64627"
+    },
+    "BiocVersion": {
+      "Package": "BiocVersion",
+      "Version": "3.18.1",
+      "Source": "Bioconductor",
+      "Requirements": [
+        "R"
+      ],
+      "Hash": "2ecaed86684f5fae76ed5530f9d29c4a"
+    },
     "CoordinateCleaner": {
       "Package": "CoordinateCleaner",
       "Version": "3.0.1",
@@ -42,6 +64,17 @@
       ],
       "Hash": "065ae649b05f1ff66bb0c793107508f5"
     },
+    "DEoptim": {
+      "Package": "DEoptim",
+      "Version": "2.2-8",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "methods",
+        "parallel"
+      ],
+      "Hash": "3d75dd10e7228c9d734237dda013ed90"
+    },
     "DT": {
       "Package": "DT",
       "Version": "0.33",
@@ -211,6 +244,25 @@
       ],
       "Hash": "2288423bb0f20a457800d7fc47f6aa54"
     },
+    "ape": {
+      "Package": "ape",
+      "Version": "5.8",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "Rcpp",
+        "digest",
+        "graphics",
+        "lattice",
+        "methods",
+        "nlme",
+        "parallel",
+        "stats",
+        "utils"
+      ],
+      "Hash": "16b5ff4dff0ead9ea955f62f794b1535"
+    },
     "arrow": {
       "Package": "arrow",
       "Version": "17.0.0.1",
@@ -411,7 +463,7 @@
     },
     "caret": {
       "Package": "caret",
-      "Version": "6.0-94",
+      "Version": "7.0-1",
       "Source": "Repository",
       "Repository": "CRAN",
       "Requirements": [
@@ -433,7 +485,7 @@
         "utils",
         "withr"
       ],
-      "Hash": "528692344d5a174552e3bf7acdfbaebd"
+      "Hash": "304c0d28bda6454aae3b14ae953e7824"
     },
     "cellranger": {
       "Package": "cellranger",
@@ -560,6 +612,28 @@
       ],
       "Hash": "5edbbabab6ce0bf7900a74fd4358628e"
     },
+    "clusterGeneration": {
+      "Package": "clusterGeneration",
+      "Version": "1.3.8",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "MASS",
+        "R"
+      ],
+      "Hash": "62df4b1fe7abefcf921dd7e52a801a6b"
+    },
+    "coda": {
+      "Package": "coda",
+      "Version": "0.19-4.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "lattice"
+      ],
+      "Hash": "af436915c590afc6fffc3ce3a5be1569"
+    },
     "codetools": {
       "Package": "codetools",
       "Version": "0.2-19",
@@ -584,6 +658,20 @@
       ],
       "Hash": "d954cb1c57e8d8b756165d7ba18aa55a"
     },
+    "combinat": {
+      "Package": "combinat",
+      "Version": "0.0-8",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "f0acb9dcb71a9cd9d5ae233c5035b1c5"
+    },
+    "commonmark": {
+      "Package": "commonmark",
+      "Version": "1.9.2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Hash": "14eb0596f987c71535d07c3aff814742"
+    },
     "conflicted": {
       "Package": "conflicted",
       "Version": "1.2.0",
@@ -744,6 +832,20 @@
       ],
       "Hash": "33698c4b3127fc9f506654607fb73676"
     },
+    "doParallel": {
+      "Package": "doParallel",
+      "Version": "1.0.17",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "foreach",
+        "iterators",
+        "parallel",
+        "utils"
+      ],
+      "Hash": "451e5edf411987991ab6a5410c45011f"
+    },
     "dplyr": {
       "Package": "dplyr",
       "Version": "1.1.4",
@@ -850,6 +952,17 @@
       ],
       "Hash": "3db813596387e90573ad092d5e3fde37"
     },
+    "expm": {
+      "Package": "expm",
+      "Version": "1.0-0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "Matrix",
+        "methods"
+      ],
+      "Hash": "a44b2810f36c1cda5d52eee6ec96cafa"
+    },
     "fansi": {
       "Package": "fansi",
       "Version": "1.0.6",
@@ -876,6 +989,16 @@
       "Repository": "CRAN",
       "Hash": "aa5e1cd11c2d15497494c5292d7ffcc8"
     },
+    "fastmatch": {
+      "Package": "fastmatch",
+      "Version": "1.1-4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R"
+      ],
+      "Hash": "8c406b7284bbaef08e01c6687367f195"
+    },
     "filelock": {
       "Package": "filelock",
       "Version": "1.0.3",
@@ -1572,6 +1695,18 @@
       ],
       "Hash": "7ce2733a9826b3aeb1775d56fd305472"
     },
+    "maps": {
+      "Package": "maps",
+      "Version": "3.4.2.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "graphics",
+        "utils"
+      ],
+      "Hash": "354a8094c634031421eeb5103f2e1f80"
+    },
     "memoise": {
       "Package": "memoise",
       "Version": "2.0.1",
@@ -1620,6 +1755,16 @@
       ],
       "Hash": "785ef8e22389d4a7634c6c944f2dc07d"
     },
+    "mnormt": {
+      "Package": "mnormt",
+      "Version": "2.1.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R"
+      ],
+      "Hash": "c83992ef63553d1e4b97162a4a753470"
+    },
     "modelr": {
       "Package": "modelr",
       "Version": "0.1.11",
@@ -1716,6 +1861,18 @@
       ],
       "Hash": "d413e0fef796c9401a4419485f709ca1"
     },
+    "optimParallel": {
+      "Package": "optimParallel",
+      "Version": "1.0-2",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "parallel",
+        "stats"
+      ],
+      "Hash": "5a15e068f0bd9bac4252f8f9e11dc79f"
+    },
     "pROC": {
       "Package": "pROC",
       "Version": "1.18.5",
@@ -1767,6 +1924,62 @@
       ],
       "Hash": "abf0ca85c1c752e0d04f46334e635046"
     },
+    "phangorn": {
+      "Package": "phangorn",
+      "Version": "2.12.1",
+      "Source": "Bioconductor",
+      "Repository": "CRAN",
+      "Requirements": [
+        "Matrix",
+        "R",
+        "Rcpp",
+        "ape",
+        "digest",
+        "fastmatch",
+        "generics",
+        "grDevices",
+        "graphics",
+        "igraph",
+        "methods",
+        "parallel",
+        "quadprog",
+        "stats",
+        "utils"
+      ],
+      "Hash": "eea536597b96f3df44a2d6b963ac280e"
+    },
+    "phytools": {
+      "Package": "phytools",
+      "Version": "2.3-0",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "DEoptim",
+        "MASS",
+        "R",
+        "ape",
+        "clusterGeneration",
+        "coda",
+        "combinat",
+        "doParallel",
+        "expm",
+        "foreach",
+        "grDevices",
+        "graphics",
+        "maps",
+        "methods",
+        "mnormt",
+        "nlme",
+        "numDeriv",
+        "optimParallel",
+        "parallel",
+        "phangorn",
+        "scatterplot3d",
+        "stats",
+        "utils"
+      ],
+      "Hash": "592feaaac935ea62afa4399fd83201fd"
+    },
     "pillar": {
       "Package": "pillar",
       "Version": "1.9.0",
@@ -1969,6 +2182,16 @@
       ],
       "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc"
     },
+    "quadprog": {
+      "Package": "quadprog",
+      "Version": "1.5-8",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R"
+      ],
+      "Hash": "5f919ae5e7f83a6f91dcf2288943370d"
+    },
     "ragg": {
       "Package": "ragg",
       "Version": "1.3.3",
@@ -2305,6 +2528,19 @@
       ],
       "Hash": "c19df082ba346b0ffa6f833e92de34d1"
     },
+    "scatterplot3d": {
+      "Package": "scatterplot3d",
+      "Version": "0.3-44",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "grDevices",
+        "graphics",
+        "stats"
+      ],
+      "Hash": "10ee4b91ec812690bd55d9bf51edccee"
+    },
     "secretbase": {
       "Package": "secretbase",
       "Version": "1.0.3",
@@ -2364,6 +2600,49 @@
       ],
       "Hash": "5c47e84dc0a3ca761ae1d307889e796d"
     },
+    "shiny": {
+      "Package": "shiny",
+      "Version": "1.9.1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "R6",
+        "bslib",
+        "cachem",
+        "commonmark",
+        "crayon",
+        "fastmap",
+        "fontawesome",
+        "glue",
+        "grDevices",
+        "htmltools",
+        "httpuv",
+        "jsonlite",
+        "later",
+        "lifecycle",
+        "methods",
+        "mime",
+        "promises",
+        "rlang",
+        "sourcetools",
+        "tools",
+        "utils",
+        "withr",
+        "xtable"
+      ],
+      "Hash": "6a293995a66e12c48d13aa1f957d09c7"
+    },
+    "sourcetools": {
+      "Package": "sourcetools",
+      "Version": "0.1.7-1",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R"
+      ],
+      "Hash": "5f5a7629f956619d519205ec475fe647"
+    },
     "sp": {
       "Package": "sp",
       "Version": "2.1-4",
@@ -2889,6 +3168,18 @@
       ],
       "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61"
     },
+    "xtable": {
+      "Package": "xtable",
+      "Version": "1.8-4",
+      "Source": "Repository",
+      "Repository": "CRAN",
+      "Requirements": [
+        "R",
+        "stats",
+        "utils"
+      ],
+      "Hash": "b8acdf8af494d9ec19ccb2481a9b11c2"
+    },
     "yaml": {
       "Package": "yaml",
       "Version": "2.3.10",
-- 
GitLab


From aa163c9d326cadddf5a86a1b93e666c56cd85a9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Mon, 6 Jan 2025 11:41:34 +0100
Subject: [PATCH 4/7] updated presence preparation, fine tuned absence sampling

---
 R/01_02_raster_preparation.R   |  10 +--
 R/03_01_presence_preparation.R |  22 +++--
 R/03_02_absence_preparation.R  | 150 ++++++++++++++-------------------
 3 files changed, 84 insertions(+), 98 deletions(-)

diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R
index fae113f..1d6951a 100644
--- a/R/01_02_raster_preparation.R
+++ b/R/01_02_raster_preparation.R
@@ -32,17 +32,17 @@ terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "m
 # Seasonal inundation ####
 igsw = list.files("I:/mas_data/00_data/processed/iGSW/", pattern = "seasonal_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% 
   terra::rast() %>% 
-  terra::resample(chelsa) %>%   
-  app(mean, na.rm = TRUE)
+  terra::resample(chelsa, threads = 32) %>%   
+  app(mean, na.rm = TRUE) %>% 
+  subst(NA, 0)
 terra::set.names(igsw, "igsw")
 terra::writeRaster(igsw, filename = file.path("data", "geospatial", "raster", "mean_seasonalInundation_1992-2010.tif"), overwrite = T)
 
 # Distance to freshwater ####
 dtfw = list.files("I:/mas_data/00_data/processed/linearDistance/", pattern = "lake|reservoirs|river", full.names = TRUE) %>% 
   terra::rast() %>% 
-  crop(study_extent) %>% 
-  min(na.rm = T)  %>% 
-  terra::resample(chelsa)
+  terra::resample(chelsa) %>% 
+  app(mean, na.rm = TRUE)
 terra::set.names(dtfw, "dtfw")
 terra::writeRaster(dtfw, filename = file.path("data", "geospatial", "raster", "mean_distanceToWater.tif"), overwrite = T)
 
diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R
index c8705b9..6344de7 100644
--- a/R/03_01_presence_preparation.R
+++ b/R/03_01_presence_preparation.R
@@ -20,14 +20,15 @@ sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
 # ---------------------------------------------------------------------------#
 # Prepare Geodata                                                         ####
 # ---------------------------------------------------------------------------#
-# TODO: finalize predictor choice
-raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% 
-  stringr::str_sort(numeric = T)
-
 sa_polygon = rnaturalearth::ne_countries() %>% 
   dplyr::filter(continent == "South America") %>% 
   sf::st_union()
 
+raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
+  stringr::str_sort(numeric = T) 
+
+raster_data = terra::rast(raster_filepaths) 
+
 # ---------------------------------------------------------------------------#
 # Prepare Occurrence Data                                                 ####
 # ---------------------------------------------------------------------------#
@@ -60,9 +61,18 @@ occs_flagged = occs %>%
   dplyr::filter(.summary == T) %>% 
   dplyr::select(species, coordinate_id, longitude, latitude)
 
-# Finalize and save occurrence dataset
+# Subset species occurrences to validated records
 occs_final = occs %>% 
   inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% 
   dplyr::select(-coordinate_id)
 
-save(occs_final, file = "data/r_objects/occs_final.RData")
\ No newline at end of file
+# Extract environmental data
+proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
+
+raster_data = terra::project(raster_data, proj_string)
+occs_final = st_transform(occs_final, proj_string)
+env_data = extract(raster_data, vect(occs_final), ID = F)
+
+# Save occurrences
+occs_final = bind_cols(occs_final, env_data)
+save(occs_final, file = "data/r_objects/occs_final.RData")
diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index 5a34407..6104ae6 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -13,10 +13,8 @@ library(geos)
 library(MASS)
 library(dismo)
 library(spatialEco)
-
-# DB packages
-library(Symobio)
-library(DBI)
+library(blockCV)
+library(caret)
 
 source("R/utils.R")
 
@@ -32,92 +30,80 @@ sa_polygon = rnaturalearth::ne_countries() %>%
   dplyr::filter(continent == "South America") %>% 
   sf::st_union() 
 
-raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
+raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
+  #raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
   stringr::str_sort(numeric = T) 
 
-raster_data = terra::rast(raster_filepaths) %>% 
-  terra::crop(sf::st_bbox(sa_polygon))
-
 # Project (proj string generated with SA bbox coordinates on https://projectionwizard.org)
 proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
 
-raster_data = terra::project(raster_data, proj_string)
+# raster_data = terra::project(raster_data, proj_string)
 sa_polygon = st_transform(sa_polygon, proj_string)
 occs_final = st_transform(occs_final, proj_string)
 range_maps = st_transform(range_maps, proj_string)
 
 # ---------------------------------------------------------------------------#
-# 1. Distance-based sampling                                              ####
+# Sample absences                                                         ####
 #                                                                            #
 # Absences should be predominantly sampled from regions that are             #
 # geographically close (reproduce sampling biases) but environmentally       #
 # dissimilar (avoid false negatives) to the known occurrences                #
 # ---------------------------------------------------------------------------#
-future::plan("multisession", workers = 16)
-pseudo_absences = furrr::future_map(
-  .x = target_species, 
-  .options = furrr::furrr_options(seed = 42), 
+future::plan("multisession", workers = 24)
+
+model_data = furrr::future_walk(
+  .x = target_species,
+  .options = furrr::furrr_options(seed = 42),
+  .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string),
   .f = function(species){
     # Prepare species occurrence data #####
     occs_spec = dplyr::filter(occs_final, species == !!species)
     
-    occs_combined = occs_spec %>% 
+    occs_multipoint = occs_spec %>% 
       summarise(geometry = st_combine(geometry))
     
-    occs_bff = occs_spec %>% 
-      st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling
-      st_union()
-    
     range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% 
       rename(species = name_matched)
     
     # Define model/sampling region
-    # 1. Union all point coordinates from range polygon and occurrences records
+    # 1. Union all points from range polygon and occurrences records
     # 2. Build concave polygon from unionen points
     # 3. Buffer by 100 km to extend sampling region into new environments
     # 4. Crop by South America outline
     sampling_region = st_cast(range_spec, "MULTIPOINT") %>% 
-      st_union(occs_combined) %>% 
+      st_union(occs_multipoint) %>% 
       geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11
       st_as_sf() %>% 
       st_set_crs(proj_string) %>% 
       st_buffer(dist = 100000) %>% 
       st_intersection(sa_polygon) 
     
+    # Load raster data 
+    raster_data = terra::rast(raster_filepaths) %>% 
+      terra::project(proj_string) %>% 
+      terra::crop(sf::st_bbox(sampling_region))
+    
     # Define spatial KDE
     ref = st_bbox(sampling_region)[c(1,3,2,4)]
     min_extent = min(ref[2]-ref[1], ref[4]-ref[3])
-    occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
+    spatial_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
       crop(sampling_region, mask = T, touches = F)
     
-    # Environmental KDE
-    env_sampling_region = raster::raster(terra::crop(raster_data, sampling_region, mask = T))
-    maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec))
-
-# Step 3: Predict Across the Raster
-# Predict environmental similarity (continuous output)
-env_pred <- predict(maxent_model, env_sampling_region)
-    
-    
-    svm_model <- e1071::svm(x = env_spec,
-                            nu = 0.01,
-                            gamma = 0.00005,
-                            type = 'one-classification',
-                            kernel = "radial",
-                            scale = FALSE)
-    
-    env_pred <- terra::predict(env_sampling_region, fun = function(x) {
-      dnorm(x, mean = gmm_model$parameters$mean, sd = gmm_model$parameters$variance$sigma)
-    })
+    # Define reverse niche
+    env_sampling_region = terra::crop(raster_data, sampling_region, mask = T) %>% 
+      terra::resample(spatial_kde) %>% 
+      raster::stack()
     
-    env_pred = predict(env_sampling_region, svm_model, na.rm = T)
-    plot(env_pred)
+    maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec))
+    env_kde <- (1-predict(env_sampling_region, maxent_fit)) %>% 
+      terra::rast()
     
-    # Sampling of pseudo absence
+    # Rejection sampling of pseudo absences
     abs_spec = st_sf(geometry = st_sfc(), crs = proj_string)
+    abs_prob = spatial_kde * env_kde
     while(nrow(abs_spec) < nrow(occs_spec)){
       sample_points_raw = st_sample(sampling_region, nrow(occs_spec))
-      p = extract(occs_kde, vect(sample_points_raw))$z
+      p = extract(abs_prob, vect(sample_points_raw))$z
       x = runif(length(p))
       
       sample_points = sample_points_raw[x<p]
@@ -127,42 +113,40 @@ env_pred <- predict(maxent_model, env_sampling_region)
         sample_points = sample(sample_points, samples_required)
       }
       
-      abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) 
+      abs_env = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) %>% 
+        dplyr::mutate(
+          present = 0,
+          geometry = st_geometry(sample_points)
+        ) %>% 
+        drop_na()
+      
+      abs_spec = bind_rows(abs_spec, abs_env) 
     }
     
-    plot(sa_polygon)
-    #plot(occs_kde, add = T, legend = F)
-    plot(range_spec, add = T, col = "#00000000")
-    plot(occs_spec, add = T, col = "red", pch = 3, cex = 0.5)
-    plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5)
-    
-    
-    sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% 
-      dplyr::mutate(
-        species = occs_spec$species[1],
-        longitude = sf::st_coordinates(.)[,1],
-        latitude = sf::st_coordinates(.)[,2]
-      ) %>% 
-      dplyr::rename(geometry = "x")
-    
-    abs_spec = terra::rast(raster_filepaths) %>% 
-      setNames(paste0("layer_", 1:19)) %>% 
-      terra::extract(sample_points) %>% 
-      dplyr::select(-ID) %>% 
-      dplyr::mutate(
-        present = 0,
-        geometry = sample_points$x
-      ) %>% 
-      tibble() %>% 
-      bind_cols(sample_points)
+    abs_coords = st_coordinates(st_transform(abs_spec, "EPSG:4326")) 
+    abs_spec = bind_cols(abs_spec, abs_coords) %>% 
+      dplyr::rename(longitude = X, latitude = Y) %>% 
+      dplyr::mutate(species = !!species)
     
     # Create presence-absence dataframe
     pa_spec = occs_spec %>% 
       dplyr::mutate(present = 1) %>% 
       bind_rows(abs_spec) 
     
+    # ggplot() +
+    #   ggtitle(species) +
+    #   geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") +
+    #   tidyterra::geom_spatraster(data = abs_prob) +
+    #   scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") +
+    #   geom_sf(data = range_spec, alpha = 0, color = "black") +
+    #   geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
+    #   scale_color_discrete(name = "Presence") +
+    #   theme_minimal()
+    # 
+    # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2)
+    
     # Split into train and test datasets
-    train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
+    train_index = caret::createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
     pa_spec$train = 0
     pa_spec$train[train_index] = 1
     
@@ -184,23 +168,15 @@ env_pred <- predict(maxent_model, env_sampling_region)
       NA
     })
     
-    pa_spec$folds = folds
+    pa_spec$fold = folds
     pa_spec$geometry = NULL
     
-    return(pa_spec)
+    save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData"))
   })
 
-model_data = bind_rows(model_data)
-save(model_data, file = "data/r_objects/model_data.RData")
-
-
-
+# Combine presence absence data
+files = list.files("data/r_objects/pa_sampling/", full.names = T)
+model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
+  bind_rows()
 
-ggplot() +
-  geom_sf(data = sa_polygon, fill = "red") +
-  geom_sf(data = range_spec, fill = "green") + 
-  geom_sf(data = occs_spec) +
-  #   facet_wrap("species", nrow = 2) + 
-  #   theme_minimal()
-  #   
-  
\ No newline at end of file
+save(model_data, file = "data/r_objects/model_data.RData")
\ No newline at end of file
-- 
GitLab


From 2f676f2723389feda85ba208cd17d7ff35e75fe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Thu, 9 Jan 2025 11:08:45 +0100
Subject: [PATCH 5/7] some tweaks in presence and absence preparation

---
 R/03_01_presence_preparation.R |  3 ++-
 R/03_02_absence_preparation.R  | 16 +++++++---------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R
index 6344de7..9fb6ee3 100644
--- a/R/03_01_presence_preparation.R
+++ b/R/03_01_presence_preparation.R
@@ -74,5 +74,6 @@ occs_final = st_transform(occs_final, proj_string)
 env_data = extract(raster_data, vect(occs_final), ID = F)
 
 # Save occurrences
-occs_final = bind_cols(occs_final, env_data)
+occs_final = bind_cols(occs_final, env_data) %>% 
+  drop_na()
 save(occs_final, file = "data/r_objects/occs_final.RData")
diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index 6104ae6..951dded 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -103,12 +103,15 @@ model_data = furrr::future_walk(
     abs_prob = spatial_kde * env_kde
     while(nrow(abs_spec) < nrow(occs_spec)){
       sample_points_raw = st_sample(sampling_region, nrow(occs_spec))
-      p = extract(abs_prob, vect(sample_points_raw))$z
+      p = na.omit(extract(abs_prob, vect(sample_points_raw))$z)
       x = runif(length(p))
       
       sample_points = sample_points_raw[x<p]
+      if(length(sample_points) == 0){
+        next
+      }
+
       samples_required = nrow(occs_spec) - nrow(abs_spec)
-      
       if(length(sample_points) > samples_required){
         sample_points = sample(sample_points, samples_required)
       }
@@ -145,11 +148,6 @@ model_data = furrr::future_walk(
     # 
     # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2)
     
-    # Split into train and test datasets
-    train_index = caret::createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
-    pa_spec$train = 0
-    pa_spec$train[train_index] = 1
-    
     # Define cross-validation folds
     folds = tryCatch({
       spatial_folds = suppressMessages(
@@ -174,9 +172,9 @@ model_data = furrr::future_walk(
     save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData"))
   })
 
-# Combine presence absence data
+# Combine presence-absence data
 files = list.files("data/r_objects/pa_sampling/", full.names = T)
 model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
   bind_rows()
 
-save(model_data, file = "data/r_objects/model_data.RData")
\ No newline at end of file
+save(model_data, file = "data/r_objects/pa_sampling/model_data.RData")
-- 
GitLab


From a72dfbf83cac22e6eabfc339807ea9a1e1f36f56 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Thu, 9 Jan 2025 12:41:59 +0100
Subject: [PATCH 6/7] filtered duplicate occurrences, auto bandwidth in spatial
 kde

---
 R/03_01_presence_preparation.R |  8 ++++++--
 R/03_02_absence_preparation.R  | 29 +++++++++++++++--------------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R
index 9fb6ee3..8d3fce3 100644
--- a/R/03_01_presence_preparation.R
+++ b/R/03_01_presence_preparation.R
@@ -73,7 +73,11 @@ raster_data = terra::project(raster_data, proj_string)
 occs_final = st_transform(occs_final, proj_string)
 env_data = extract(raster_data, vect(occs_final), ID = F)
 
-# Save occurrences
+# Merge data + final processing
 occs_final = bind_cols(occs_final, env_data) %>% 
-  drop_na()
+  drop_na() %>% # Remove records with NA env vars
+  group_by(species) %>% 
+  distinct()    # Remove duplicate records
+
+# Save occurrences
 save(occs_final, file = "data/r_objects/occs_final.RData")
diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index 951dded..2390512 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -85,8 +85,7 @@ model_data = furrr::future_walk(
     
     # Define spatial KDE
     ref = st_bbox(sampling_region)[c(1,3,2,4)]
-    min_extent = min(ref[2]-ref[1], ref[4]-ref[3])
-    spatial_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
+    spatial_kde = spatialEco::sf.kde(occs_spec, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
       crop(sampling_region, mask = T, touches = F)
     
     # Define reverse niche
@@ -110,7 +109,7 @@ model_data = furrr::future_walk(
       if(length(sample_points) == 0){
         next
       }
-
+      
       samples_required = nrow(occs_spec) - nrow(abs_spec)
       if(length(sample_points) > samples_required){
         sample_points = sample(sample_points, samples_required)
@@ -136,17 +135,19 @@ model_data = furrr::future_walk(
       dplyr::mutate(present = 1) %>% 
       bind_rows(abs_spec) 
     
-    # ggplot() +
-    #   ggtitle(species) +
-    #   geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") +
-    #   tidyterra::geom_spatraster(data = abs_prob) +
-    #   scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") +
-    #   geom_sf(data = range_spec, alpha = 0, color = "black") +
-    #   geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
-    #   scale_color_discrete(name = "Presence") +
-    #   theme_minimal()
-    # 
-    # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2)
+    ggplot() +
+      ggtitle(species) +
+      geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") +
+      tidyterra::geom_spatraster(data = abs_prob) +
+      scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") +
+      geom_sf(data = range_spec, alpha = 0, color = "black") +
+      geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
+      scale_color_discrete(name = "Presence") +
+      theme_minimal()
+    
+    try({
+      ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2)
+    })
     
     # Define cross-validation folds
     folds = tryCatch({
-- 
GitLab


From e3da5ed40a04a46b91552915633f8ce3c67a26ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Fri, 10 Jan 2025 14:34:00 +0100
Subject: [PATCH 7/7] improved implementation of absence sampling algorithnm

---
 R/03_02_absence_preparation.R | 86 ++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index 2390512..914ddb7 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -49,15 +49,19 @@ range_maps = st_transform(range_maps, proj_string)
 # geographically close (reproduce sampling biases) but environmentally       #
 # dissimilar (avoid false negatives) to the known occurrences                #
 # ---------------------------------------------------------------------------#
-future::plan("multisession", workers = 24)
+future::plan("multisession", workers = 16)
 
 model_data = furrr::future_walk(
   .x = target_species,
-  .options = furrr::furrr_options(seed = 42),
+  .options = furrr::furrr_options(seed = 42, scheduling = FALSE), # make sure workers get constant stream of work
   .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string),
   .f = function(species){
     # Prepare species occurrence data #####
     occs_spec = dplyr::filter(occs_final, species == !!species)
+    if(nrow(occs_spec) <= 1){
+      warning("Skipping species ", species, ": Can't process species with a single presence record." )
+      return()
+    }
     
     occs_multipoint = occs_spec %>% 
       summarise(geometry = st_combine(geometry))
@@ -85,7 +89,19 @@ model_data = furrr::future_walk(
     
     # Define spatial KDE
     ref = st_bbox(sampling_region)[c(1,3,2,4)]
-    spatial_kde = spatialEco::sf.kde(occs_spec, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
+    
+    # Function to estimate bandwidth 
+    # ---> this function is the default estimation of the sf.kde package but with a minimum of 50000 = 50km)
+    calc_bw <- function(x) {
+      r <- stats::quantile(x, c(0.25, 0.75))
+      h <- (r[2] - r[1])/1.34
+      result = 4 * 1.06 * min(sqrt(stats::var(x)), h) * length(x)^(-1/5)
+      return(max(result, 50000))
+    }
+    
+    bw_x = calc_bw(st_coordinates(occs_spec)[,"X"])
+    bw_y = calc_bw(st_coordinates(occs_spec)[,"Y"])
+    spatial_kde = spatialEco::sf.kde(occs_spec, bw = c(bw_x, bw_y), res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
       crop(sampling_region, mask = T, touches = F)
     
     # Define reverse niche
@@ -97,38 +113,43 @@ model_data = furrr::future_walk(
     env_kde <- (1-predict(env_sampling_region, maxent_fit)) %>% 
       terra::rast()
     
-    # Rejection sampling of pseudo absences
-    abs_spec = st_sf(geometry = st_sfc(), crs = proj_string)
+    # Sample pseudo absences
     abs_prob = spatial_kde * env_kde
-    while(nrow(abs_spec) < nrow(occs_spec)){
-      sample_points_raw = st_sample(sampling_region, nrow(occs_spec))
-      p = na.omit(extract(abs_prob, vect(sample_points_raw))$z)
-      x = runif(length(p))
-      
-      sample_points = sample_points_raw[x<p]
-      if(length(sample_points) == 0){
-        next
-      }
-      
-      samples_required = nrow(occs_spec) - nrow(abs_spec)
-      if(length(sample_points) > samples_required){
-        sample_points = sample(sample_points, samples_required)
-      }
+    abs_spec_list = list()
+    
+    samples_required = nrow(occs_spec)
+    
+    while(samples_required != 0){
+      sample_points = spatSample(abs_prob, size = samples_required, method = "weights", xy = T, as.df = T) %>% 
+        dplyr::select(-z) %>% 
+        dplyr::mutate(x = x + runif(nrow(.), -5000, 5000),         # Add jitter (res/2) to cell centroids 
+                      y = y + runif(nrow(.), -5000, 5000)) %>%     # Add jitter (res/2) to cell centroids 
+        st_as_sf(coords = c("x", "y"), crs = proj_string)
       
-      abs_env = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) %>% 
-        dplyr::mutate(
-          present = 0,
-          geometry = st_geometry(sample_points)
-        ) %>% 
+      sample_abs = bind_cols(
+        terra::extract(raster_data, terra::vect(sample_points), ID = FALSE)
+      ) %>% 
+        bind_cols(sample_points) %>% 
         drop_na()
       
-      abs_spec = bind_rows(abs_spec, abs_env) 
+      abs_spec_list[[length(abs_spec_list)+1]] = sample_points
+      
+      samples_required = samples_required - nrow(sample_points) # Sometimes there are no env data for sample points, so keep sampling
     }
     
-    abs_coords = st_coordinates(st_transform(abs_spec, "EPSG:4326")) 
-    abs_spec = bind_cols(abs_spec, abs_coords) %>% 
-      dplyr::rename(longitude = X, latitude = Y) %>% 
-      dplyr::mutate(species = !!species)
+    abs_spec = bind_rows(abs_spec_list)
+    abs_spec = abs_spec %>% 
+      bind_cols(
+        st_coordinates(st_transform(abs_spec, "EPSG:4326")) 
+      ) %>% 
+      dplyr::rename(
+        longitude = X,
+        latitude = Y
+      ) %>% 
+      dplyr::mutate(
+        species = !!species,
+        present = 0
+      ) 
     
     # Create presence-absence dataframe
     pa_spec = occs_spec %>% 
@@ -167,15 +188,16 @@ model_data = furrr::future_walk(
       NA
     })
     
-    pa_spec$fold = folds
+    pa_spec$fold_eval = folds
     pa_spec$geometry = NULL
     
     save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData"))
-  })
+  }
+)
 
 # Combine presence-absence data
 files = list.files("data/r_objects/pa_sampling/", full.names = T)
 model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
   bind_rows()
 
-save(model_data, file = "data/r_objects/pa_sampling/model_data.RData")
+save(model_data, file = "data/r_objects/model_data_pa_sampling.RData")
-- 
GitLab