From aa163c9d326cadddf5a86a1b93e666c56cd85a9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Mon, 6 Jan 2025 11:41:34 +0100
Subject: [PATCH] updated presence preparation, fine tuned absence sampling

---
 R/01_02_raster_preparation.R   |  10 +--
 R/03_01_presence_preparation.R |  22 +++--
 R/03_02_absence_preparation.R  | 150 ++++++++++++++-------------------
 3 files changed, 84 insertions(+), 98 deletions(-)

diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R
index fae113f..1d6951a 100644
--- a/R/01_02_raster_preparation.R
+++ b/R/01_02_raster_preparation.R
@@ -32,17 +32,17 @@ terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "m
 # Seasonal inundation ####
 igsw = list.files("I:/mas_data/00_data/processed/iGSW/", pattern = "seasonal_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% 
   terra::rast() %>% 
-  terra::resample(chelsa) %>%   
-  app(mean, na.rm = TRUE)
+  terra::resample(chelsa, threads = 32) %>%   
+  app(mean, na.rm = TRUE) %>% 
+  subst(NA, 0)
 terra::set.names(igsw, "igsw")
 terra::writeRaster(igsw, filename = file.path("data", "geospatial", "raster", "mean_seasonalInundation_1992-2010.tif"), overwrite = T)
 
 # Distance to freshwater ####
 dtfw = list.files("I:/mas_data/00_data/processed/linearDistance/", pattern = "lake|reservoirs|river", full.names = TRUE) %>% 
   terra::rast() %>% 
-  crop(study_extent) %>% 
-  min(na.rm = T)  %>% 
-  terra::resample(chelsa)
+  terra::resample(chelsa) %>% 
+  app(mean, na.rm = TRUE)
 terra::set.names(dtfw, "dtfw")
 terra::writeRaster(dtfw, filename = file.path("data", "geospatial", "raster", "mean_distanceToWater.tif"), overwrite = T)
 
diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R
index c8705b9..6344de7 100644
--- a/R/03_01_presence_preparation.R
+++ b/R/03_01_presence_preparation.R
@@ -20,14 +20,15 @@ sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
 # ---------------------------------------------------------------------------#
 # Prepare Geodata                                                         ####
 # ---------------------------------------------------------------------------#
-# TODO: finalize predictor choice
-raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% 
-  stringr::str_sort(numeric = T)
-
 sa_polygon = rnaturalearth::ne_countries() %>% 
   dplyr::filter(continent == "South America") %>% 
   sf::st_union()
 
+raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
+  stringr::str_sort(numeric = T) 
+
+raster_data = terra::rast(raster_filepaths) 
+
 # ---------------------------------------------------------------------------#
 # Prepare Occurrence Data                                                 ####
 # ---------------------------------------------------------------------------#
@@ -60,9 +61,18 @@ occs_flagged = occs %>%
   dplyr::filter(.summary == T) %>% 
   dplyr::select(species, coordinate_id, longitude, latitude)
 
-# Finalize and save occurrence dataset
+# Subset species occurrences to validated records
 occs_final = occs %>% 
   inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% 
   dplyr::select(-coordinate_id)
 
-save(occs_final, file = "data/r_objects/occs_final.RData")
\ No newline at end of file
+# Extract environmental data
+proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
+
+raster_data = terra::project(raster_data, proj_string)
+occs_final = st_transform(occs_final, proj_string)
+env_data = extract(raster_data, vect(occs_final), ID = F)
+
+# Save occurrences
+occs_final = bind_cols(occs_final, env_data)
+save(occs_final, file = "data/r_objects/occs_final.RData")
diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R
index 5a34407..6104ae6 100644
--- a/R/03_02_absence_preparation.R
+++ b/R/03_02_absence_preparation.R
@@ -13,10 +13,8 @@ library(geos)
 library(MASS)
 library(dismo)
 library(spatialEco)
-
-# DB packages
-library(Symobio)
-library(DBI)
+library(blockCV)
+library(caret)
 
 source("R/utils.R")
 
@@ -32,92 +30,80 @@ sa_polygon = rnaturalearth::ne_countries() %>%
   dplyr::filter(continent == "South America") %>% 
   sf::st_union() 
 
-raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
+raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% 
+  #raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
   stringr::str_sort(numeric = T) 
 
-raster_data = terra::rast(raster_filepaths) %>% 
-  terra::crop(sf::st_bbox(sa_polygon))
-
 # Project (proj string generated with SA bbox coordinates on https://projectionwizard.org)
 proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" 
 
-raster_data = terra::project(raster_data, proj_string)
+# raster_data = terra::project(raster_data, proj_string)
 sa_polygon = st_transform(sa_polygon, proj_string)
 occs_final = st_transform(occs_final, proj_string)
 range_maps = st_transform(range_maps, proj_string)
 
 # ---------------------------------------------------------------------------#
-# 1. Distance-based sampling                                              ####
+# Sample absences                                                         ####
 #                                                                            #
 # Absences should be predominantly sampled from regions that are             #
 # geographically close (reproduce sampling biases) but environmentally       #
 # dissimilar (avoid false negatives) to the known occurrences                #
 # ---------------------------------------------------------------------------#
-future::plan("multisession", workers = 16)
-pseudo_absences = furrr::future_map(
-  .x = target_species, 
-  .options = furrr::furrr_options(seed = 42), 
+future::plan("multisession", workers = 24)
+
+model_data = furrr::future_walk(
+  .x = target_species,
+  .options = furrr::furrr_options(seed = 42),
+  .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string),
   .f = function(species){
     # Prepare species occurrence data #####
     occs_spec = dplyr::filter(occs_final, species == !!species)
     
-    occs_combined = occs_spec %>% 
+    occs_multipoint = occs_spec %>% 
       summarise(geometry = st_combine(geometry))
     
-    occs_bff = occs_spec %>% 
-      st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling
-      st_union()
-    
     range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% 
       rename(species = name_matched)
     
     # Define model/sampling region
-    # 1. Union all point coordinates from range polygon and occurrences records
+    # 1. Union all points from range polygon and occurrences records
     # 2. Build concave polygon from unionen points
     # 3. Buffer by 100 km to extend sampling region into new environments
     # 4. Crop by South America outline
     sampling_region = st_cast(range_spec, "MULTIPOINT") %>% 
-      st_union(occs_combined) %>% 
+      st_union(occs_multipoint) %>% 
       geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11
       st_as_sf() %>% 
       st_set_crs(proj_string) %>% 
       st_buffer(dist = 100000) %>% 
       st_intersection(sa_polygon) 
     
+    # Load raster data 
+    raster_data = terra::rast(raster_filepaths) %>% 
+      terra::project(proj_string) %>% 
+      terra::crop(sf::st_bbox(sampling_region))
+    
     # Define spatial KDE
     ref = st_bbox(sampling_region)[c(1,3,2,4)]
     min_extent = min(ref[2]-ref[1], ref[4]-ref[3])
-    occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
+    spatial_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% 
       crop(sampling_region, mask = T, touches = F)
     
-    # Environmental KDE
-    env_sampling_region = raster::raster(terra::crop(raster_data, sampling_region, mask = T))
-    maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec))
-
-# Step 3: Predict Across the Raster
-# Predict environmental similarity (continuous output)
-env_pred <- predict(maxent_model, env_sampling_region)
-    
-    
-    svm_model <- e1071::svm(x = env_spec,
-                            nu = 0.01,
-                            gamma = 0.00005,
-                            type = 'one-classification',
-                            kernel = "radial",
-                            scale = FALSE)
-    
-    env_pred <- terra::predict(env_sampling_region, fun = function(x) {
-      dnorm(x, mean = gmm_model$parameters$mean, sd = gmm_model$parameters$variance$sigma)
-    })
+    # Define reverse niche
+    env_sampling_region = terra::crop(raster_data, sampling_region, mask = T) %>% 
+      terra::resample(spatial_kde) %>% 
+      raster::stack()
     
-    env_pred = predict(env_sampling_region, svm_model, na.rm = T)
-    plot(env_pred)
+    maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec))
+    env_kde <- (1-predict(env_sampling_region, maxent_fit)) %>% 
+      terra::rast()
     
-    # Sampling of pseudo absence
+    # Rejection sampling of pseudo absences
     abs_spec = st_sf(geometry = st_sfc(), crs = proj_string)
+    abs_prob = spatial_kde * env_kde
     while(nrow(abs_spec) < nrow(occs_spec)){
       sample_points_raw = st_sample(sampling_region, nrow(occs_spec))
-      p = extract(occs_kde, vect(sample_points_raw))$z
+      p = extract(abs_prob, vect(sample_points_raw))$z
       x = runif(length(p))
       
       sample_points = sample_points_raw[x<p]
@@ -127,42 +113,40 @@ env_pred <- predict(maxent_model, env_sampling_region)
         sample_points = sample(sample_points, samples_required)
       }
       
-      abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) 
+      abs_env = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) %>% 
+        dplyr::mutate(
+          present = 0,
+          geometry = st_geometry(sample_points)
+        ) %>% 
+        drop_na()
+      
+      abs_spec = bind_rows(abs_spec, abs_env) 
     }
     
-    plot(sa_polygon)
-    #plot(occs_kde, add = T, legend = F)
-    plot(range_spec, add = T, col = "#00000000")
-    plot(occs_spec, add = T, col = "red", pch = 3, cex = 0.5)
-    plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5)
-    
-    
-    sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% 
-      dplyr::mutate(
-        species = occs_spec$species[1],
-        longitude = sf::st_coordinates(.)[,1],
-        latitude = sf::st_coordinates(.)[,2]
-      ) %>% 
-      dplyr::rename(geometry = "x")
-    
-    abs_spec = terra::rast(raster_filepaths) %>% 
-      setNames(paste0("layer_", 1:19)) %>% 
-      terra::extract(sample_points) %>% 
-      dplyr::select(-ID) %>% 
-      dplyr::mutate(
-        present = 0,
-        geometry = sample_points$x
-      ) %>% 
-      tibble() %>% 
-      bind_cols(sample_points)
+    abs_coords = st_coordinates(st_transform(abs_spec, "EPSG:4326")) 
+    abs_spec = bind_cols(abs_spec, abs_coords) %>% 
+      dplyr::rename(longitude = X, latitude = Y) %>% 
+      dplyr::mutate(species = !!species)
     
     # Create presence-absence dataframe
     pa_spec = occs_spec %>% 
       dplyr::mutate(present = 1) %>% 
       bind_rows(abs_spec) 
     
+    # ggplot() +
+    #   ggtitle(species) +
+    #   geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") +
+    #   tidyterra::geom_spatraster(data = abs_prob) +
+    #   scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") +
+    #   geom_sf(data = range_spec, alpha = 0, color = "black") +
+    #   geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
+    #   scale_color_discrete(name = "Presence") +
+    #   theme_minimal()
+    # 
+    # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2)
+    
     # Split into train and test datasets
-    train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
+    train_index = caret::createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
     pa_spec$train = 0
     pa_spec$train[train_index] = 1
     
@@ -184,23 +168,15 @@ env_pred <- predict(maxent_model, env_sampling_region)
       NA
     })
     
-    pa_spec$folds = folds
+    pa_spec$fold = folds
     pa_spec$geometry = NULL
     
-    return(pa_spec)
+    save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData"))
   })
 
-model_data = bind_rows(model_data)
-save(model_data, file = "data/r_objects/model_data.RData")
-
-
-
+# Combine presence absence data
+files = list.files("data/r_objects/pa_sampling/", full.names = T)
+model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% 
+  bind_rows()
 
-ggplot() +
-  geom_sf(data = sa_polygon, fill = "red") +
-  geom_sf(data = range_spec, fill = "green") + 
-  geom_sf(data = occs_spec) +
-  #   facet_wrap("species", nrow = 2) + 
-  #   theme_minimal()
-  #   
-  
\ No newline at end of file
+save(model_data, file = "data/r_objects/model_data.RData")
\ No newline at end of file
-- 
GitLab