From aa163c9d326cadddf5a86a1b93e666c56cd85a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Mon, 6 Jan 2025 11:41:34 +0100 Subject: [PATCH] updated presence preparation, fine tuned absence sampling --- R/01_02_raster_preparation.R | 10 +-- R/03_01_presence_preparation.R | 22 +++-- R/03_02_absence_preparation.R | 150 ++++++++++++++------------------- 3 files changed, 84 insertions(+), 98 deletions(-) diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R index fae113f..1d6951a 100644 --- a/R/01_02_raster_preparation.R +++ b/R/01_02_raster_preparation.R @@ -32,17 +32,17 @@ terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "m # Seasonal inundation #### igsw = list.files("I:/mas_data/00_data/processed/iGSW/", pattern = "seasonal_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% terra::rast() %>% - terra::resample(chelsa) %>% - app(mean, na.rm = TRUE) + terra::resample(chelsa, threads = 32) %>% + app(mean, na.rm = TRUE) %>% + subst(NA, 0) terra::set.names(igsw, "igsw") terra::writeRaster(igsw, filename = file.path("data", "geospatial", "raster", "mean_seasonalInundation_1992-2010.tif"), overwrite = T) # Distance to freshwater #### dtfw = list.files("I:/mas_data/00_data/processed/linearDistance/", pattern = "lake|reservoirs|river", full.names = TRUE) %>% terra::rast() %>% - crop(study_extent) %>% - min(na.rm = T) %>% - terra::resample(chelsa) + terra::resample(chelsa) %>% + app(mean, na.rm = TRUE) terra::set.names(dtfw, "dtfw") terra::writeRaster(dtfw, filename = file.path("data", "geospatial", "raster", "mean_distanceToWater.tif"), overwrite = T) diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R index c8705b9..6344de7 100644 --- a/R/03_01_presence_preparation.R +++ b/R/03_01_presence_preparation.R @@ -20,14 +20,15 @@ sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry # ---------------------------------------------------------------------------# # Prepare Geodata #### # ---------------------------------------------------------------------------# -# TODO: finalize predictor choice -raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% - stringr::str_sort(numeric = T) - sa_polygon = rnaturalearth::ne_countries() %>% dplyr::filter(continent == "South America") %>% sf::st_union() +raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% + stringr::str_sort(numeric = T) + +raster_data = terra::rast(raster_filepaths) + # ---------------------------------------------------------------------------# # Prepare Occurrence Data #### # ---------------------------------------------------------------------------# @@ -60,9 +61,18 @@ occs_flagged = occs %>% dplyr::filter(.summary == T) %>% dplyr::select(species, coordinate_id, longitude, latitude) -# Finalize and save occurrence dataset +# Subset species occurrences to validated records occs_final = occs %>% inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% dplyr::select(-coordinate_id) -save(occs_final, file = "data/r_objects/occs_final.RData") \ No newline at end of file +# Extract environmental data +proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" + +raster_data = terra::project(raster_data, proj_string) +occs_final = st_transform(occs_final, proj_string) +env_data = extract(raster_data, vect(occs_final), ID = F) + +# Save occurrences +occs_final = bind_cols(occs_final, env_data) +save(occs_final, file = "data/r_objects/occs_final.RData") diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 5a34407..6104ae6 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -13,10 +13,8 @@ library(geos) library(MASS) library(dismo) library(spatialEco) - -# DB packages -library(Symobio) -library(DBI) +library(blockCV) +library(caret) source("R/utils.R") @@ -32,92 +30,80 @@ sa_polygon = rnaturalearth::ne_countries() %>% dplyr::filter(continent == "South America") %>% sf::st_union() -raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice +raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% + #raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice stringr::str_sort(numeric = T) -raster_data = terra::rast(raster_filepaths) %>% - terra::crop(sf::st_bbox(sa_polygon)) - # Project (proj string generated with SA bbox coordinates on https://projectionwizard.org) proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" -raster_data = terra::project(raster_data, proj_string) +# raster_data = terra::project(raster_data, proj_string) sa_polygon = st_transform(sa_polygon, proj_string) occs_final = st_transform(occs_final, proj_string) range_maps = st_transform(range_maps, proj_string) # ---------------------------------------------------------------------------# -# 1. Distance-based sampling #### +# Sample absences #### # # # Absences should be predominantly sampled from regions that are # # geographically close (reproduce sampling biases) but environmentally # # dissimilar (avoid false negatives) to the known occurrences # # ---------------------------------------------------------------------------# -future::plan("multisession", workers = 16) -pseudo_absences = furrr::future_map( - .x = target_species, - .options = furrr::furrr_options(seed = 42), +future::plan("multisession", workers = 24) + +model_data = furrr::future_walk( + .x = target_species, + .options = furrr::furrr_options(seed = 42), + .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string), .f = function(species){ # Prepare species occurrence data ##### occs_spec = dplyr::filter(occs_final, species == !!species) - occs_combined = occs_spec %>% + occs_multipoint = occs_spec %>% summarise(geometry = st_combine(geometry)) - occs_bff = occs_spec %>% - st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling - st_union() - range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% rename(species = name_matched) # Define model/sampling region - # 1. Union all point coordinates from range polygon and occurrences records + # 1. Union all points from range polygon and occurrences records # 2. Build concave polygon from unionen points # 3. Buffer by 100 km to extend sampling region into new environments # 4. Crop by South America outline sampling_region = st_cast(range_spec, "MULTIPOINT") %>% - st_union(occs_combined) %>% + st_union(occs_multipoint) %>% geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11 st_as_sf() %>% st_set_crs(proj_string) %>% st_buffer(dist = 100000) %>% st_intersection(sa_polygon) + # Load raster data + raster_data = terra::rast(raster_filepaths) %>% + terra::project(proj_string) %>% + terra::crop(sf::st_bbox(sampling_region)) + # Define spatial KDE ref = st_bbox(sampling_region)[c(1,3,2,4)] min_extent = min(ref[2]-ref[1], ref[4]-ref[3]) - occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% + spatial_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% crop(sampling_region, mask = T, touches = F) - # Environmental KDE - env_sampling_region = raster::raster(terra::crop(raster_data, sampling_region, mask = T)) - maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec)) - -# Step 3: Predict Across the Raster -# Predict environmental similarity (continuous output) -env_pred <- predict(maxent_model, env_sampling_region) - - - svm_model <- e1071::svm(x = env_spec, - nu = 0.01, - gamma = 0.00005, - type = 'one-classification', - kernel = "radial", - scale = FALSE) - - env_pred <- terra::predict(env_sampling_region, fun = function(x) { - dnorm(x, mean = gmm_model$parameters$mean, sd = gmm_model$parameters$variance$sigma) - }) + # Define reverse niche + env_sampling_region = terra::crop(raster_data, sampling_region, mask = T) %>% + terra::resample(spatial_kde) %>% + raster::stack() - env_pred = predict(env_sampling_region, svm_model, na.rm = T) - plot(env_pred) + maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec)) + env_kde <- (1-predict(env_sampling_region, maxent_fit)) %>% + terra::rast() - # Sampling of pseudo absence + # Rejection sampling of pseudo absences abs_spec = st_sf(geometry = st_sfc(), crs = proj_string) + abs_prob = spatial_kde * env_kde while(nrow(abs_spec) < nrow(occs_spec)){ sample_points_raw = st_sample(sampling_region, nrow(occs_spec)) - p = extract(occs_kde, vect(sample_points_raw))$z + p = extract(abs_prob, vect(sample_points_raw))$z x = runif(length(p)) sample_points = sample_points_raw[x<p] @@ -127,42 +113,40 @@ env_pred <- predict(maxent_model, env_sampling_region) sample_points = sample(sample_points, samples_required) } - abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) + abs_env = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) %>% + dplyr::mutate( + present = 0, + geometry = st_geometry(sample_points) + ) %>% + drop_na() + + abs_spec = bind_rows(abs_spec, abs_env) } - plot(sa_polygon) - #plot(occs_kde, add = T, legend = F) - plot(range_spec, add = T, col = "#00000000") - plot(occs_spec, add = T, col = "red", pch = 3, cex = 0.5) - plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5) - - - sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% - dplyr::mutate( - species = occs_spec$species[1], - longitude = sf::st_coordinates(.)[,1], - latitude = sf::st_coordinates(.)[,2] - ) %>% - dplyr::rename(geometry = "x") - - abs_spec = terra::rast(raster_filepaths) %>% - setNames(paste0("layer_", 1:19)) %>% - terra::extract(sample_points) %>% - dplyr::select(-ID) %>% - dplyr::mutate( - present = 0, - geometry = sample_points$x - ) %>% - tibble() %>% - bind_cols(sample_points) + abs_coords = st_coordinates(st_transform(abs_spec, "EPSG:4326")) + abs_spec = bind_cols(abs_spec, abs_coords) %>% + dplyr::rename(longitude = X, latitude = Y) %>% + dplyr::mutate(species = !!species) # Create presence-absence dataframe pa_spec = occs_spec %>% dplyr::mutate(present = 1) %>% bind_rows(abs_spec) + # ggplot() + + # ggtitle(species) + + # geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") + + # tidyterra::geom_spatraster(data = abs_prob) + + # scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") + + # geom_sf(data = range_spec, alpha = 0, color = "black") + + # geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) + + # scale_color_discrete(name = "Presence") + + # theme_minimal() + # + # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2) + # Split into train and test datasets - train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) + train_index = caret::createDataPartition(pa_spec$present, p = 0.7, list = FALSE) pa_spec$train = 0 pa_spec$train[train_index] = 1 @@ -184,23 +168,15 @@ env_pred <- predict(maxent_model, env_sampling_region) NA }) - pa_spec$folds = folds + pa_spec$fold = folds pa_spec$geometry = NULL - return(pa_spec) + save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) }) -model_data = bind_rows(model_data) -save(model_data, file = "data/r_objects/model_data.RData") - - - +# Combine presence absence data +files = list.files("data/r_objects/pa_sampling/", full.names = T) +model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% + bind_rows() -ggplot() + - geom_sf(data = sa_polygon, fill = "red") + - geom_sf(data = range_spec, fill = "green") + - geom_sf(data = occs_spec) + - # facet_wrap("species", nrow = 2) + - # theme_minimal() - # - \ No newline at end of file +save(model_data, file = "data/r_objects/model_data.RData") \ No newline at end of file -- GitLab