From c33c0424fc345f738efc284ba5b47dfde9643702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Wed, 18 Dec 2024 22:27:33 +0100 Subject: [PATCH 1/7] protoyped kde based absence sampling --- R/03_01_presence_preparation.R | 68 +++++++++++ R/03_02_absence_preparation.R | 174 ++++++++++++++++++++++++++++ R/03_presence_absence_preparation.R | 160 ------------------------- 3 files changed, 242 insertions(+), 160 deletions(-) create mode 100644 R/03_01_presence_preparation.R create mode 100644 R/03_02_absence_preparation.R delete mode 100644 R/03_presence_absence_preparation.R diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R new file mode 100644 index 0000000..c8705b9 --- /dev/null +++ b/R/03_01_presence_preparation.R @@ -0,0 +1,68 @@ +# General packages +library(dplyr) +library(tidyr) +library(furrr) + +# Geo packages +library(terra) +library(CoordinateCleaner) +library(sf) + +# DB packages +library(Symobio) +library(DBI) + +source("R/utils.R") + +con = db_connect() # Connection to Symobio +sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry + +# ---------------------------------------------------------------------------# +# Prepare Geodata #### +# ---------------------------------------------------------------------------# +# TODO: finalize predictor choice +raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% + stringr::str_sort(numeric = T) + +sa_polygon = rnaturalearth::ne_countries() %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() + +# ---------------------------------------------------------------------------# +# Prepare Occurrence Data #### +# ---------------------------------------------------------------------------# +load("data/r_objects/range_maps.RData") +target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) + +# Query species from Symobio +occs = tbl(con, "species_occurrences") %>% + dplyr::filter(species %in% target_species) %>% + dplyr::select(-year) %>% + dplyr::distinct() %>% + collect() %>% + sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>% + sf::st_filter(sa_polygon) + +# Flag occurrences using Coordinate cleaner +occs_flagged = occs %>% + dplyr::distinct(species, coordinate_id, longitude, latitude) %>% + group_by(species) %>% + group_split() %>% + purrr::map( # Loop over species individually due to bug in CoordinateCleaner + CoordinateCleaner::clean_coordinates, + lon = "longitude", + lat = "latitude", + tests = c("centroids", "gbif", "equal", "institutions", "outliers"), + outliers_method = "quantile", + verbose = F + ) %>% + bind_rows() %>% + dplyr::filter(.summary == T) %>% + dplyr::select(species, coordinate_id, longitude, latitude) + +# Finalize and save occurrence dataset +occs_final = occs %>% + inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% + dplyr::select(-coordinate_id) + +save(occs_final, file = "data/r_objects/occs_final.RData") \ No newline at end of file diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R new file mode 100644 index 0000000..b72d132 --- /dev/null +++ b/R/03_02_absence_preparation.R @@ -0,0 +1,174 @@ +# General packages +library(dplyr) +library(tidyr) +library(furrr) + +# Geo packages +library(terra) +library(CoordinateCleaner) +library(sf) +library(MASS) +library(geos) + +# DB packages +library(Symobio) +library(DBI) + +source("R/utils.R") + +load("data/r_objects/range_maps.RData") +load("data/r_objects/occs_final.RData") + +# ---------------------------------------------------------------------------# +# Prepare Geodata #### +# ---------------------------------------------------------------------------# +target_species = unique(occs_final$species) + +sa_polygon = rnaturalearth::ne_countries() %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() + +raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice + stringr::str_sort(numeric = T) + +raster_data = terra::rast(raster_filepaths) %>% + terra::crop(sf::st_bbox(sa_polygon)) + +# Project (proj string generated with SA bbox coordinates on https://projectionwizard.org) +proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" + +raster_data = terra::project(raster_data, proj_string) +sa_polygon = st_transform(sa_polygon, proj_string) +occs_final = st_transform(occs_final, proj_string) +range_maps = st_transform(range_maps, proj_string) + +# ---------------------------------------------------------------------------# +# 1. Distance-based sampling #### +# # +# Absences should be predominantly sampled from regions that are # +# geographically close (reproduce sampling biases) but environmentally # +# dissimilar (avoid false negatives) to the known occurrences # +# ---------------------------------------------------------------------------# +# Geographic distance +future::plan("multisession", workers = 16) +pseudo_absences = furrr::future_map( + .x = target_species, + .options = furrr::furrr_options(seed = 42), + .f = function(species){ + # Prepare occurrences + occs_spec = dplyr::filter(occs_final, species == !!species) + + occs_poly = occs_spec %>% + summarise(geometry = st_combine(geometry)) %>% + st_concave_hull(ratio = 0.5) + + occs_bff = occs_spec %>% + st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling + st_union() + + range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% + rename(species = name_matched) + + # Define model/sampling region + sampling_region = st_union(range_spec, occs_poly) %>% # Union occs poly and expert range + st_buffer(dist = 100000) %>% # Buffer by 100 km to extend sampling region into new environments + st_intersection(sa_polygon) %>% # Crop to South America + st_difference(occs_bff) # Crop out buffered presences + + # Define KDE + ref = st_bbox(sampling_region)[c(1,3,2,4)] + min_extent = min(ref[2]-ref[1], ref[4]-ref[3]) + occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% + crop(sampling_region, mask = T, touches = F) + + # Sampling of pseudo absence + abs_spec = st_sf(geometry = st_sfc(), crs = proj_string) + while(nrow(abs_spec) < nrow(occs_spec)){ + sample_points_raw = st_sample(sampling_region, nrow(occs_spec)) + p = extract(occs_kde, vect(sample_points_raw))$z + x = runif(length(p)) + + sample_points = sample_points_raw[x<p] + samples_required = nrow(occs_spec) - nrow(abs_spec) + + if(length(sample_points) > samples_required){ + sample_points = sample(sample_points, samples_required) + } + + abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) + } + + plot(sa_polygon) + plot(occs_kde, add = T, legend = F) + plot(range_spec, add = T, col = "#00000000") + plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5) + plot(occs_spec, add = T, col = "white", pch = 3, cex = 0.5) + + + sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% + dplyr::mutate( + species = occs_spec$species[1], + longitude = sf::st_coordinates(.)[,1], + latitude = sf::st_coordinates(.)[,2] + ) %>% + dplyr::rename(geometry = "x") + + abs_spec = terra::rast(raster_filepaths) %>% + setNames(paste0("layer_", 1:19)) %>% + terra::extract(sample_points) %>% + dplyr::select(-ID) %>% + dplyr::mutate( + present = 0, + geometry = sample_points$x + ) %>% + tibble() %>% + bind_cols(sample_points) + + # Create presence-absence dataframe + pa_spec = occs_spec %>% + dplyr::mutate(present = 1) %>% + bind_rows(abs_spec) + + # Split into train and test datasets + train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) + pa_spec$train = 0 + pa_spec$train[train_index] = 1 + + # Define cross-validation folds + folds = tryCatch({ + spatial_folds = suppressMessages( + blockCV::cv_spatial( + pa_spec, + column = "present", + k = 5, + progress = F, plot = F, report = F + ) + ) + + spatial_folds$folds_ids + }, warning = function(w){ + NA + }, error = function(e){ + NA + }) + + pa_spec$folds = folds + pa_spec$geometry = NULL + + return(pa_spec) + }) + +model_data = bind_rows(model_data) +save(model_data, file = "data/r_objects/model_data.RData") + + + + +ggplot() + + geom_sf(data = sa_polygon, fill = "red") + + geom_sf(data = range_spec, fill = "green") + + geom_sf(data = occs_spec) + + # facet_wrap("species", nrow = 2) + + # theme_minimal() + # + \ No newline at end of file diff --git a/R/03_presence_absence_preparation.R b/R/03_presence_absence_preparation.R deleted file mode 100644 index 9fd817c..0000000 --- a/R/03_presence_absence_preparation.R +++ /dev/null @@ -1,160 +0,0 @@ -# General packages -library(dplyr) -library(tidyr) -library(furrr) - -# Geo packages -library(terra) -library(CoordinateCleaner) -library(sf) - -# DB packages -library(Symobio) -library(DBI) - -source("R/utils.R") - -con = db_connect() # Connection to Symobio -sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry - -# ---------------------------------------------------------------------------# -# Prepare Geodata #### -# ---------------------------------------------------------------------------# -raster_info = tbl(con, "datasets") %>% - dplyr::filter(stringr::str_detect(name, "CHELSA")) %>% - collect() - -# raster_filepaths = list.files(raster_info$raw_data_path, pattern = ".tif$", full.names = T) %>% -# stringr::str_sort(numeric = T) - -raster_filepaths = list.files("I:/mas_data/00_data/processed/CHELSA_v2-1_bioclim", pattern = ".tif$", full.names = T) %>% - stringr::str_sort(numeric = T) - -sa_polygon = rnaturalearth::ne_countries() %>% - dplyr::filter(continent == "South America") %>% - sf::st_union() - -# ---------------------------------------------------------------------------# -# Prepare Occurrence Data #### -# ---------------------------------------------------------------------------# -load("data/r_objects/range_maps.RData") -target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) - -occs = tbl(con, "species_occurrences") %>% - dplyr::filter(species %in% target_species) %>% - dplyr::select(-year) %>% - dplyr::distinct() %>% - collect() %>% - sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>% - sf::st_filter(sa_polygon) - -occs_flagged = occs %>% - dplyr::distinct(species, coordinate_id, longitude, latitude) %>% - group_by(species) %>% - group_split() %>% - purrr::map( # Loop over species individually due to bug in CoordinateCleaner - CoordinateCleaner::clean_coordinates, - lon = "longitude", - lat = "latitude", - tests = c("centroids", "gbif", "equal", "institutions", "outliers"), - outliers_method = "quantile", - verbose = F - ) %>% - bind_rows() %>% - dplyr::filter(.summary == T) %>% - dplyr::select(species, coordinate_id, longitude, latitude) - -env_vars = tbl(con, "raster_extracts_num") %>% - dplyr::filter( - coordinate_id %in% occs$coordinate_id, - metric == "mean" - ) %>% - arrange(raster_layer_id) %>% - tidyr::pivot_wider(id_cols = coordinate_id, names_from = raster_layer_id, names_prefix = "layer_", values_from = value) %>% - collect() - -occs_final = occs %>% - inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% - inner_join(env_vars, by = "coordinate_id") %>% - dplyr::select(-coordinate_id) - -save(occs_final, file = "data/r_objects/occs_final.RData") - -# ---------------------------------------------------------------------------# -# Create Modeling dataset #### -# ---------------------------------------------------------------------------# -load("data/r_objects/occs_final.RData") -load("data/r_objects/sa_polygon.RData") -sf::sf_use_s2(use_s2 = FALSE) - -occs_split = split(occs_final, occs_final$species) - -future::plan("multisession", workers = 16) -model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){ - # Define model/sampling region - occs_bbox = occs_spec %>% - sf::st_bbox() %>% - expand_bbox(min_span = 1, expansion = 0.25) %>% - sf::st_as_sfc() %>% - st_set_crs(st_crs(occs_spec)) - - sample_region = suppressMessages( - st_as_sf(st_intersection(occs_bbox, sa_polygon)) - ) - - # Sample pseudo absence - sample_points = st_as_sf(st_sample(sample_region, nrow(occs_spec))) %>% - dplyr::mutate( - species = occs_spec$species[1], - longitude = sf::st_coordinates(.)[,1], - latitude = sf::st_coordinates(.)[,2] - ) %>% - dplyr::rename(geometry = "x") - - abs_spec = terra::rast(raster_filepaths) %>% - setNames(paste0("layer_", 1:19)) %>% - terra::extract(sample_points) %>% - dplyr::select(-ID) %>% - dplyr::mutate( - present = 0, - geometry = sample_points$x - ) %>% - tibble() %>% - bind_cols(sample_points) - - # Create presence-absence dataframe - pa_spec = occs_spec %>% - dplyr::mutate(present = 1) %>% - bind_rows(abs_spec) - - # Split into train and test datasets - train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) - pa_spec$train = 0 - pa_spec$train[train_index] = 1 - - # Define cross-validation folds - folds = tryCatch({ - spatial_folds = suppressMessages( - blockCV::cv_spatial( - pa_spec, - column = "present", - k = 5, - progress = F, plot = F, report = F - ) - ) - - spatial_folds$folds_ids - }, warning = function(w){ - NA - }, error = function(e){ - NA - }) - - pa_spec$folds = folds - pa_spec$geometry = NULL - - return(pa_spec) -}) - -model_data = bind_rows(model_data) -save(model_data, file = "data/r_objects/model_data.RData") -- GitLab From e32d220e9b11a5e4d6d6b69d29ef15b9695bfd9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Thu, 19 Dec 2024 14:41:28 +0100 Subject: [PATCH 2/7] work on reverse niche --- .gitignore | 1 + R/03_02_absence_preparation.R | 58 +++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index ac6cec0..3acf079 100644 --- a/.gitignore +++ b/.gitignore @@ -11,5 +11,6 @@ renv/cache/ # Data files data/ +plots/ R/*/ R/*.html \ No newline at end of file diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index b72d132..5a34407 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -7,9 +7,13 @@ library(furrr) library(terra) library(CoordinateCleaner) library(sf) -library(MASS) library(geos) +# Stat packages +library(MASS) +library(dismo) +library(spatialEco) + # DB packages library(Symobio) library(DBI) @@ -49,18 +53,16 @@ range_maps = st_transform(range_maps, proj_string) # geographically close (reproduce sampling biases) but environmentally # # dissimilar (avoid false negatives) to the known occurrences # # ---------------------------------------------------------------------------# -# Geographic distance future::plan("multisession", workers = 16) pseudo_absences = furrr::future_map( .x = target_species, .options = furrr::furrr_options(seed = 42), .f = function(species){ - # Prepare occurrences + # Prepare species occurrence data ##### occs_spec = dplyr::filter(occs_final, species == !!species) - occs_poly = occs_spec %>% - summarise(geometry = st_combine(geometry)) %>% - st_concave_hull(ratio = 0.5) + occs_combined = occs_spec %>% + summarise(geometry = st_combine(geometry)) occs_bff = occs_spec %>% st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling @@ -70,17 +72,47 @@ pseudo_absences = furrr::future_map( rename(species = name_matched) # Define model/sampling region - sampling_region = st_union(range_spec, occs_poly) %>% # Union occs poly and expert range - st_buffer(dist = 100000) %>% # Buffer by 100 km to extend sampling region into new environments - st_intersection(sa_polygon) %>% # Crop to South America - st_difference(occs_bff) # Crop out buffered presences + # 1. Union all point coordinates from range polygon and occurrences records + # 2. Build concave polygon from unionen points + # 3. Buffer by 100 km to extend sampling region into new environments + # 4. Crop by South America outline + sampling_region = st_cast(range_spec, "MULTIPOINT") %>% + st_union(occs_combined) %>% + geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11 + st_as_sf() %>% + st_set_crs(proj_string) %>% + st_buffer(dist = 100000) %>% + st_intersection(sa_polygon) - # Define KDE + # Define spatial KDE ref = st_bbox(sampling_region)[c(1,3,2,4)] min_extent = min(ref[2]-ref[1], ref[4]-ref[3]) occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% crop(sampling_region, mask = T, touches = F) + # Environmental KDE + env_sampling_region = raster::raster(terra::crop(raster_data, sampling_region, mask = T)) + maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec)) + +# Step 3: Predict Across the Raster +# Predict environmental similarity (continuous output) +env_pred <- predict(maxent_model, env_sampling_region) + + + svm_model <- e1071::svm(x = env_spec, + nu = 0.01, + gamma = 0.00005, + type = 'one-classification', + kernel = "radial", + scale = FALSE) + + env_pred <- terra::predict(env_sampling_region, fun = function(x) { + dnorm(x, mean = gmm_model$parameters$mean, sd = gmm_model$parameters$variance$sigma) + }) + + env_pred = predict(env_sampling_region, svm_model, na.rm = T) + plot(env_pred) + # Sampling of pseudo absence abs_spec = st_sf(geometry = st_sfc(), crs = proj_string) while(nrow(abs_spec) < nrow(occs_spec)){ @@ -99,10 +131,10 @@ pseudo_absences = furrr::future_map( } plot(sa_polygon) - plot(occs_kde, add = T, legend = F) + #plot(occs_kde, add = T, legend = F) plot(range_spec, add = T, col = "#00000000") + plot(occs_spec, add = T, col = "red", pch = 3, cex = 0.5) plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5) - plot(occs_spec, add = T, col = "white", pch = 3, cex = 0.5) sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% -- GitLab From f1914575bd20253a094c33854a6c89fc4102a4d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Fri, 3 Jan 2025 15:49:20 +0100 Subject: [PATCH 3/7] environmental variable selection --- ...ration.R => 01_01_range_map_preparation.R} | 0 R/01_02_raster_preparation.R | 55 ++++ renv.lock | 295 +++++++++++++++++- 3 files changed, 348 insertions(+), 2 deletions(-) rename R/{01_range_map_preparation.R => 01_01_range_map_preparation.R} (100%) create mode 100644 R/01_02_raster_preparation.R diff --git a/R/01_range_map_preparation.R b/R/01_01_range_map_preparation.R similarity index 100% rename from R/01_range_map_preparation.R rename to R/01_01_range_map_preparation.R diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R new file mode 100644 index 0000000..fae113f --- /dev/null +++ b/R/01_02_raster_preparation.R @@ -0,0 +1,55 @@ +library(tidyverse) +library(terra) +library(rnaturalearth) +library(furrr) + +# Define Study extent #### +study_extent = rnaturalearth::ne_countries() %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() %>% + sf::st_bbox() %>% + ext() + +# CHELSA bioclim target variables #### +chelsa_urls = c("https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_cmi_mean_1981-2010_V.2.1.tif", + "https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_rsds_1981-2010_mean_V.2.1.tif", + "https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio6_1981-2010_V.2.1.tif", + "https://os.zhdk.cloud.switch.ch/chelsav2/GLOBAL/climatologies/1981-2010/bio/CHELSA_bio17_1981-2010_V.2.1.tif") + +chelsa = terra::rast(chelsa_urls) %>% + crop(study_extent) +set.names(chelsa, c("cmi", "rsds", "bio6", "bio17")) +terra::writeRaster(chelsa, filename = file.path("data", "geospatial", "raster", basename(chelsa_urls)), overwrite = T) + +# Tree canopy cover #### +igfc = list.files("I:/mas_data/00_data/processed/iGFC/", pattern = "canopyDensity_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% + terra::rast() %>% + terra::resample(chelsa) %>% + app(mean, na.rm = TRUE) +terra::set.names(igfc, "igfc") +terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "mean_canopyDensity_1992-2010.tif"), overwrite = T) + +# Seasonal inundation #### +igsw = list.files("I:/mas_data/00_data/processed/iGSW/", pattern = "seasonal_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% + terra::rast() %>% + terra::resample(chelsa) %>% + app(mean, na.rm = TRUE) +terra::set.names(igsw, "igsw") +terra::writeRaster(igsw, filename = file.path("data", "geospatial", "raster", "mean_seasonalInundation_1992-2010.tif"), overwrite = T) + +# Distance to freshwater #### +dtfw = list.files("I:/mas_data/00_data/processed/linearDistance/", pattern = "lake|reservoirs|river", full.names = TRUE) %>% + terra::rast() %>% + crop(study_extent) %>% + min(na.rm = T) %>% + terra::resample(chelsa) +terra::set.names(dtfw, "dtfw") +terra::writeRaster(dtfw, filename = file.path("data", "geospatial", "raster", "mean_distanceToWater.tif"), overwrite = T) + +# Terrain #### +roughness = list.files("C:/Users/ye87zine/Downloads", pattern = "roughness", full.names = T) %>% + terra::rast() %>% + crop(study_extent) %>% + terra::resample(chelsa) +terra::set.names(roughness, "roughness") +terra::writeRaster(roughness, filename = file.path("data", "geospatial", "raster", "terrain_roughness.tif"), overwrite = T) diff --git a/renv.lock b/renv.lock index 1a9a7b4..ff0fc5a 100644 --- a/renv.lock +++ b/renv.lock @@ -8,7 +8,29 @@ } ] }, + "Bioconductor": { + "Version": "3.18" + }, "Packages": { + "BiocManager": { + "Package": "BiocManager", + "Version": "1.30.25", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "3aec5928ca10897d7a0a1205aae64627" + }, + "BiocVersion": { + "Package": "BiocVersion", + "Version": "3.18.1", + "Source": "Bioconductor", + "Requirements": [ + "R" + ], + "Hash": "2ecaed86684f5fae76ed5530f9d29c4a" + }, "CoordinateCleaner": { "Package": "CoordinateCleaner", "Version": "3.0.1", @@ -42,6 +64,17 @@ ], "Hash": "065ae649b05f1ff66bb0c793107508f5" }, + "DEoptim": { + "Package": "DEoptim", + "Version": "2.2-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods", + "parallel" + ], + "Hash": "3d75dd10e7228c9d734237dda013ed90" + }, "DT": { "Package": "DT", "Version": "0.33", @@ -211,6 +244,25 @@ ], "Hash": "2288423bb0f20a457800d7fc47f6aa54" }, + "ape": { + "Package": "ape", + "Version": "5.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp", + "digest", + "graphics", + "lattice", + "methods", + "nlme", + "parallel", + "stats", + "utils" + ], + "Hash": "16b5ff4dff0ead9ea955f62f794b1535" + }, "arrow": { "Package": "arrow", "Version": "17.0.0.1", @@ -411,7 +463,7 @@ }, "caret": { "Package": "caret", - "Version": "6.0-94", + "Version": "7.0-1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -433,7 +485,7 @@ "utils", "withr" ], - "Hash": "528692344d5a174552e3bf7acdfbaebd" + "Hash": "304c0d28bda6454aae3b14ae953e7824" }, "cellranger": { "Package": "cellranger", @@ -560,6 +612,28 @@ ], "Hash": "5edbbabab6ce0bf7900a74fd4358628e" }, + "clusterGeneration": { + "Package": "clusterGeneration", + "Version": "1.3.8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "MASS", + "R" + ], + "Hash": "62df4b1fe7abefcf921dd7e52a801a6b" + }, + "coda": { + "Package": "coda", + "Version": "0.19-4.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "lattice" + ], + "Hash": "af436915c590afc6fffc3ce3a5be1569" + }, "codetools": { "Package": "codetools", "Version": "0.2-19", @@ -584,6 +658,20 @@ ], "Hash": "d954cb1c57e8d8b756165d7ba18aa55a" }, + "combinat": { + "Package": "combinat", + "Version": "0.0-8", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "f0acb9dcb71a9cd9d5ae233c5035b1c5" + }, + "commonmark": { + "Package": "commonmark", + "Version": "1.9.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "14eb0596f987c71535d07c3aff814742" + }, "conflicted": { "Package": "conflicted", "Version": "1.2.0", @@ -744,6 +832,20 @@ ], "Hash": "33698c4b3127fc9f506654607fb73676" }, + "doParallel": { + "Package": "doParallel", + "Version": "1.0.17", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "foreach", + "iterators", + "parallel", + "utils" + ], + "Hash": "451e5edf411987991ab6a5410c45011f" + }, "dplyr": { "Package": "dplyr", "Version": "1.1.4", @@ -850,6 +952,17 @@ ], "Hash": "3db813596387e90573ad092d5e3fde37" }, + "expm": { + "Package": "expm", + "Version": "1.0-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "methods" + ], + "Hash": "a44b2810f36c1cda5d52eee6ec96cafa" + }, "fansi": { "Package": "fansi", "Version": "1.0.6", @@ -876,6 +989,16 @@ "Repository": "CRAN", "Hash": "aa5e1cd11c2d15497494c5292d7ffcc8" }, + "fastmatch": { + "Package": "fastmatch", + "Version": "1.1-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "8c406b7284bbaef08e01c6687367f195" + }, "filelock": { "Package": "filelock", "Version": "1.0.3", @@ -1572,6 +1695,18 @@ ], "Hash": "7ce2733a9826b3aeb1775d56fd305472" }, + "maps": { + "Package": "maps", + "Version": "3.4.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "graphics", + "utils" + ], + "Hash": "354a8094c634031421eeb5103f2e1f80" + }, "memoise": { "Package": "memoise", "Version": "2.0.1", @@ -1620,6 +1755,16 @@ ], "Hash": "785ef8e22389d4a7634c6c944f2dc07d" }, + "mnormt": { + "Package": "mnormt", + "Version": "2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "c83992ef63553d1e4b97162a4a753470" + }, "modelr": { "Package": "modelr", "Version": "0.1.11", @@ -1716,6 +1861,18 @@ ], "Hash": "d413e0fef796c9401a4419485f709ca1" }, + "optimParallel": { + "Package": "optimParallel", + "Version": "1.0-2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "parallel", + "stats" + ], + "Hash": "5a15e068f0bd9bac4252f8f9e11dc79f" + }, "pROC": { "Package": "pROC", "Version": "1.18.5", @@ -1767,6 +1924,62 @@ ], "Hash": "abf0ca85c1c752e0d04f46334e635046" }, + "phangorn": { + "Package": "phangorn", + "Version": "2.12.1", + "Source": "Bioconductor", + "Repository": "CRAN", + "Requirements": [ + "Matrix", + "R", + "Rcpp", + "ape", + "digest", + "fastmatch", + "generics", + "grDevices", + "graphics", + "igraph", + "methods", + "parallel", + "quadprog", + "stats", + "utils" + ], + "Hash": "eea536597b96f3df44a2d6b963ac280e" + }, + "phytools": { + "Package": "phytools", + "Version": "2.3-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DEoptim", + "MASS", + "R", + "ape", + "clusterGeneration", + "coda", + "combinat", + "doParallel", + "expm", + "foreach", + "grDevices", + "graphics", + "maps", + "methods", + "mnormt", + "nlme", + "numDeriv", + "optimParallel", + "parallel", + "phangorn", + "scatterplot3d", + "stats", + "utils" + ], + "Hash": "592feaaac935ea62afa4399fd83201fd" + }, "pillar": { "Package": "pillar", "Version": "1.9.0", @@ -1969,6 +2182,16 @@ ], "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" }, + "quadprog": { + "Package": "quadprog", + "Version": "1.5-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "5f919ae5e7f83a6f91dcf2288943370d" + }, "ragg": { "Package": "ragg", "Version": "1.3.3", @@ -2305,6 +2528,19 @@ ], "Hash": "c19df082ba346b0ffa6f833e92de34d1" }, + "scatterplot3d": { + "Package": "scatterplot3d", + "Version": "0.3-44", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "stats" + ], + "Hash": "10ee4b91ec812690bd55d9bf51edccee" + }, "secretbase": { "Package": "secretbase", "Version": "1.0.3", @@ -2364,6 +2600,49 @@ ], "Hash": "5c47e84dc0a3ca761ae1d307889e796d" }, + "shiny": { + "Package": "shiny", + "Version": "1.9.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "bslib", + "cachem", + "commonmark", + "crayon", + "fastmap", + "fontawesome", + "glue", + "grDevices", + "htmltools", + "httpuv", + "jsonlite", + "later", + "lifecycle", + "methods", + "mime", + "promises", + "rlang", + "sourcetools", + "tools", + "utils", + "withr", + "xtable" + ], + "Hash": "6a293995a66e12c48d13aa1f957d09c7" + }, + "sourcetools": { + "Package": "sourcetools", + "Version": "0.1.7-1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "5f5a7629f956619d519205ec475fe647" + }, "sp": { "Package": "sp", "Version": "2.1-4", @@ -2889,6 +3168,18 @@ ], "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" }, + "xtable": { + "Package": "xtable", + "Version": "1.8-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "utils" + ], + "Hash": "b8acdf8af494d9ec19ccb2481a9b11c2" + }, "yaml": { "Package": "yaml", "Version": "2.3.10", -- GitLab From aa163c9d326cadddf5a86a1b93e666c56cd85a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Mon, 6 Jan 2025 11:41:34 +0100 Subject: [PATCH 4/7] updated presence preparation, fine tuned absence sampling --- R/01_02_raster_preparation.R | 10 +-- R/03_01_presence_preparation.R | 22 +++-- R/03_02_absence_preparation.R | 150 ++++++++++++++------------------- 3 files changed, 84 insertions(+), 98 deletions(-) diff --git a/R/01_02_raster_preparation.R b/R/01_02_raster_preparation.R index fae113f..1d6951a 100644 --- a/R/01_02_raster_preparation.R +++ b/R/01_02_raster_preparation.R @@ -32,17 +32,17 @@ terra::writeRaster(igfc, filename = file.path("data", "geospatial", "raster", "m # Seasonal inundation #### igsw = list.files("I:/mas_data/00_data/processed/iGSW/", pattern = "seasonal_(199[0-9]|200[0-9]|2010)", full.names = TRUE) %>% terra::rast() %>% - terra::resample(chelsa) %>% - app(mean, na.rm = TRUE) + terra::resample(chelsa, threads = 32) %>% + app(mean, na.rm = TRUE) %>% + subst(NA, 0) terra::set.names(igsw, "igsw") terra::writeRaster(igsw, filename = file.path("data", "geospatial", "raster", "mean_seasonalInundation_1992-2010.tif"), overwrite = T) # Distance to freshwater #### dtfw = list.files("I:/mas_data/00_data/processed/linearDistance/", pattern = "lake|reservoirs|river", full.names = TRUE) %>% terra::rast() %>% - crop(study_extent) %>% - min(na.rm = T) %>% - terra::resample(chelsa) + terra::resample(chelsa) %>% + app(mean, na.rm = TRUE) terra::set.names(dtfw, "dtfw") terra::writeRaster(dtfw, filename = file.path("data", "geospatial", "raster", "mean_distanceToWater.tif"), overwrite = T) diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R index c8705b9..6344de7 100644 --- a/R/03_01_presence_preparation.R +++ b/R/03_01_presence_preparation.R @@ -20,14 +20,15 @@ sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry # ---------------------------------------------------------------------------# # Prepare Geodata #### # ---------------------------------------------------------------------------# -# TODO: finalize predictor choice -raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% - stringr::str_sort(numeric = T) - sa_polygon = rnaturalearth::ne_countries() %>% dplyr::filter(continent == "South America") %>% sf::st_union() +raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% + stringr::str_sort(numeric = T) + +raster_data = terra::rast(raster_filepaths) + # ---------------------------------------------------------------------------# # Prepare Occurrence Data #### # ---------------------------------------------------------------------------# @@ -60,9 +61,18 @@ occs_flagged = occs %>% dplyr::filter(.summary == T) %>% dplyr::select(species, coordinate_id, longitude, latitude) -# Finalize and save occurrence dataset +# Subset species occurrences to validated records occs_final = occs %>% inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% dplyr::select(-coordinate_id) -save(occs_final, file = "data/r_objects/occs_final.RData") \ No newline at end of file +# Extract environmental data +proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" + +raster_data = terra::project(raster_data, proj_string) +occs_final = st_transform(occs_final, proj_string) +env_data = extract(raster_data, vect(occs_final), ID = F) + +# Save occurrences +occs_final = bind_cols(occs_final, env_data) +save(occs_final, file = "data/r_objects/occs_final.RData") diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 5a34407..6104ae6 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -13,10 +13,8 @@ library(geos) library(MASS) library(dismo) library(spatialEco) - -# DB packages -library(Symobio) -library(DBI) +library(blockCV) +library(caret) source("R/utils.R") @@ -32,92 +30,80 @@ sa_polygon = rnaturalearth::ne_countries() %>% dplyr::filter(continent == "South America") %>% sf::st_union() -raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice +raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% + #raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice stringr::str_sort(numeric = T) -raster_data = terra::rast(raster_filepaths) %>% - terra::crop(sf::st_bbox(sa_polygon)) - # Project (proj string generated with SA bbox coordinates on https://projectionwizard.org) proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" -raster_data = terra::project(raster_data, proj_string) +# raster_data = terra::project(raster_data, proj_string) sa_polygon = st_transform(sa_polygon, proj_string) occs_final = st_transform(occs_final, proj_string) range_maps = st_transform(range_maps, proj_string) # ---------------------------------------------------------------------------# -# 1. Distance-based sampling #### +# Sample absences #### # # # Absences should be predominantly sampled from regions that are # # geographically close (reproduce sampling biases) but environmentally # # dissimilar (avoid false negatives) to the known occurrences # # ---------------------------------------------------------------------------# -future::plan("multisession", workers = 16) -pseudo_absences = furrr::future_map( - .x = target_species, - .options = furrr::furrr_options(seed = 42), +future::plan("multisession", workers = 24) + +model_data = furrr::future_walk( + .x = target_species, + .options = furrr::furrr_options(seed = 42), + .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string), .f = function(species){ # Prepare species occurrence data ##### occs_spec = dplyr::filter(occs_final, species == !!species) - occs_combined = occs_spec %>% + occs_multipoint = occs_spec %>% summarise(geometry = st_combine(geometry)) - occs_bff = occs_spec %>% - st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling - st_union() - range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% rename(species = name_matched) # Define model/sampling region - # 1. Union all point coordinates from range polygon and occurrences records + # 1. Union all points from range polygon and occurrences records # 2. Build concave polygon from unionen points # 3. Buffer by 100 km to extend sampling region into new environments # 4. Crop by South America outline sampling_region = st_cast(range_spec, "MULTIPOINT") %>% - st_union(occs_combined) %>% + st_union(occs_multipoint) %>% geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11 st_as_sf() %>% st_set_crs(proj_string) %>% st_buffer(dist = 100000) %>% st_intersection(sa_polygon) + # Load raster data + raster_data = terra::rast(raster_filepaths) %>% + terra::project(proj_string) %>% + terra::crop(sf::st_bbox(sampling_region)) + # Define spatial KDE ref = st_bbox(sampling_region)[c(1,3,2,4)] min_extent = min(ref[2]-ref[1], ref[4]-ref[3]) - occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% + spatial_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% crop(sampling_region, mask = T, touches = F) - # Environmental KDE - env_sampling_region = raster::raster(terra::crop(raster_data, sampling_region, mask = T)) - maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec)) - -# Step 3: Predict Across the Raster -# Predict environmental similarity (continuous output) -env_pred <- predict(maxent_model, env_sampling_region) - - - svm_model <- e1071::svm(x = env_spec, - nu = 0.01, - gamma = 0.00005, - type = 'one-classification', - kernel = "radial", - scale = FALSE) - - env_pred <- terra::predict(env_sampling_region, fun = function(x) { - dnorm(x, mean = gmm_model$parameters$mean, sd = gmm_model$parameters$variance$sigma) - }) + # Define reverse niche + env_sampling_region = terra::crop(raster_data, sampling_region, mask = T) %>% + terra::resample(spatial_kde) %>% + raster::stack() - env_pred = predict(env_sampling_region, svm_model, na.rm = T) - plot(env_pred) + maxent_fit <- maxent(env_sampling_region, st_coordinates(occs_spec)) + env_kde <- (1-predict(env_sampling_region, maxent_fit)) %>% + terra::rast() - # Sampling of pseudo absence + # Rejection sampling of pseudo absences abs_spec = st_sf(geometry = st_sfc(), crs = proj_string) + abs_prob = spatial_kde * env_kde while(nrow(abs_spec) < nrow(occs_spec)){ sample_points_raw = st_sample(sampling_region, nrow(occs_spec)) - p = extract(occs_kde, vect(sample_points_raw))$z + p = extract(abs_prob, vect(sample_points_raw))$z x = runif(length(p)) sample_points = sample_points_raw[x<p] @@ -127,42 +113,40 @@ env_pred <- predict(maxent_model, env_sampling_region) sample_points = sample(sample_points, samples_required) } - abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) + abs_env = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) %>% + dplyr::mutate( + present = 0, + geometry = st_geometry(sample_points) + ) %>% + drop_na() + + abs_spec = bind_rows(abs_spec, abs_env) } - plot(sa_polygon) - #plot(occs_kde, add = T, legend = F) - plot(range_spec, add = T, col = "#00000000") - plot(occs_spec, add = T, col = "red", pch = 3, cex = 0.5) - plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5) - - - sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% - dplyr::mutate( - species = occs_spec$species[1], - longitude = sf::st_coordinates(.)[,1], - latitude = sf::st_coordinates(.)[,2] - ) %>% - dplyr::rename(geometry = "x") - - abs_spec = terra::rast(raster_filepaths) %>% - setNames(paste0("layer_", 1:19)) %>% - terra::extract(sample_points) %>% - dplyr::select(-ID) %>% - dplyr::mutate( - present = 0, - geometry = sample_points$x - ) %>% - tibble() %>% - bind_cols(sample_points) + abs_coords = st_coordinates(st_transform(abs_spec, "EPSG:4326")) + abs_spec = bind_cols(abs_spec, abs_coords) %>% + dplyr::rename(longitude = X, latitude = Y) %>% + dplyr::mutate(species = !!species) # Create presence-absence dataframe pa_spec = occs_spec %>% dplyr::mutate(present = 1) %>% bind_rows(abs_spec) + # ggplot() + + # ggtitle(species) + + # geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") + + # tidyterra::geom_spatraster(data = abs_prob) + + # scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") + + # geom_sf(data = range_spec, alpha = 0, color = "black") + + # geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) + + # scale_color_discrete(name = "Presence") + + # theme_minimal() + # + # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2) + # Split into train and test datasets - train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) + train_index = caret::createDataPartition(pa_spec$present, p = 0.7, list = FALSE) pa_spec$train = 0 pa_spec$train[train_index] = 1 @@ -184,23 +168,15 @@ env_pred <- predict(maxent_model, env_sampling_region) NA }) - pa_spec$folds = folds + pa_spec$fold = folds pa_spec$geometry = NULL - return(pa_spec) + save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) }) -model_data = bind_rows(model_data) -save(model_data, file = "data/r_objects/model_data.RData") - - - +# Combine presence absence data +files = list.files("data/r_objects/pa_sampling/", full.names = T) +model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% + bind_rows() -ggplot() + - geom_sf(data = sa_polygon, fill = "red") + - geom_sf(data = range_spec, fill = "green") + - geom_sf(data = occs_spec) + - # facet_wrap("species", nrow = 2) + - # theme_minimal() - # - \ No newline at end of file +save(model_data, file = "data/r_objects/model_data.RData") \ No newline at end of file -- GitLab From 2f676f2723389feda85ba208cd17d7ff35e75fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Thu, 9 Jan 2025 11:08:45 +0100 Subject: [PATCH 5/7] some tweaks in presence and absence preparation --- R/03_01_presence_preparation.R | 3 ++- R/03_02_absence_preparation.R | 16 +++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R index 6344de7..9fb6ee3 100644 --- a/R/03_01_presence_preparation.R +++ b/R/03_01_presence_preparation.R @@ -74,5 +74,6 @@ occs_final = st_transform(occs_final, proj_string) env_data = extract(raster_data, vect(occs_final), ID = F) # Save occurrences -occs_final = bind_cols(occs_final, env_data) +occs_final = bind_cols(occs_final, env_data) %>% + drop_na() save(occs_final, file = "data/r_objects/occs_final.RData") diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 6104ae6..951dded 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -103,12 +103,15 @@ model_data = furrr::future_walk( abs_prob = spatial_kde * env_kde while(nrow(abs_spec) < nrow(occs_spec)){ sample_points_raw = st_sample(sampling_region, nrow(occs_spec)) - p = extract(abs_prob, vect(sample_points_raw))$z + p = na.omit(extract(abs_prob, vect(sample_points_raw))$z) x = runif(length(p)) sample_points = sample_points_raw[x<p] + if(length(sample_points) == 0){ + next + } + samples_required = nrow(occs_spec) - nrow(abs_spec) - if(length(sample_points) > samples_required){ sample_points = sample(sample_points, samples_required) } @@ -145,11 +148,6 @@ model_data = furrr::future_walk( # # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2) - # Split into train and test datasets - train_index = caret::createDataPartition(pa_spec$present, p = 0.7, list = FALSE) - pa_spec$train = 0 - pa_spec$train[train_index] = 1 - # Define cross-validation folds folds = tryCatch({ spatial_folds = suppressMessages( @@ -174,9 +172,9 @@ model_data = furrr::future_walk( save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) }) -# Combine presence absence data +# Combine presence-absence data files = list.files("data/r_objects/pa_sampling/", full.names = T) model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% bind_rows() -save(model_data, file = "data/r_objects/model_data.RData") \ No newline at end of file +save(model_data, file = "data/r_objects/pa_sampling/model_data.RData") -- GitLab From a72dfbf83cac22e6eabfc339807ea9a1e1f36f56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Thu, 9 Jan 2025 12:41:59 +0100 Subject: [PATCH 6/7] filtered duplicate occurrences, auto bandwidth in spatial kde --- R/03_01_presence_preparation.R | 8 ++++++-- R/03_02_absence_preparation.R | 29 +++++++++++++++-------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R index 9fb6ee3..8d3fce3 100644 --- a/R/03_01_presence_preparation.R +++ b/R/03_01_presence_preparation.R @@ -73,7 +73,11 @@ raster_data = terra::project(raster_data, proj_string) occs_final = st_transform(occs_final, proj_string) env_data = extract(raster_data, vect(occs_final), ID = F) -# Save occurrences +# Merge data + final processing occs_final = bind_cols(occs_final, env_data) %>% - drop_na() + drop_na() %>% # Remove records with NA env vars + group_by(species) %>% + distinct() # Remove duplicate records + +# Save occurrences save(occs_final, file = "data/r_objects/occs_final.RData") diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 951dded..2390512 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -85,8 +85,7 @@ model_data = furrr::future_walk( # Define spatial KDE ref = st_bbox(sampling_region)[c(1,3,2,4)] - min_extent = min(ref[2]-ref[1], ref[4]-ref[3]) - spatial_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% + spatial_kde = spatialEco::sf.kde(occs_spec, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% crop(sampling_region, mask = T, touches = F) # Define reverse niche @@ -110,7 +109,7 @@ model_data = furrr::future_walk( if(length(sample_points) == 0){ next } - + samples_required = nrow(occs_spec) - nrow(abs_spec) if(length(sample_points) > samples_required){ sample_points = sample(sample_points, samples_required) @@ -136,17 +135,19 @@ model_data = furrr::future_walk( dplyr::mutate(present = 1) %>% bind_rows(abs_spec) - # ggplot() + - # ggtitle(species) + - # geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") + - # tidyterra::geom_spatraster(data = abs_prob) + - # scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") + - # geom_sf(data = range_spec, alpha = 0, color = "black") + - # geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) + - # scale_color_discrete(name = "Presence") + - # theme_minimal() - # - # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2) + ggplot() + + ggtitle(species) + + geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") + + tidyterra::geom_spatraster(data = abs_prob) + + scale_fill_gradient(low="darkblue", high="green", na.value=NA, name = "Absence sampling probability") + + geom_sf(data = range_spec, alpha = 0, color = "black") + + geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) + + scale_color_discrete(name = "Presence") + + theme_minimal() + + try({ + ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 2) + }) # Define cross-validation folds folds = tryCatch({ -- GitLab From e3da5ed40a04a46b91552915633f8ce3c67a26ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Fri, 10 Jan 2025 14:34:00 +0100 Subject: [PATCH 7/7] improved implementation of absence sampling algorithnm --- R/03_02_absence_preparation.R | 86 ++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R index 2390512..914ddb7 100644 --- a/R/03_02_absence_preparation.R +++ b/R/03_02_absence_preparation.R @@ -49,15 +49,19 @@ range_maps = st_transform(range_maps, proj_string) # geographically close (reproduce sampling biases) but environmentally # # dissimilar (avoid false negatives) to the known occurrences # # ---------------------------------------------------------------------------# -future::plan("multisession", workers = 24) +future::plan("multisession", workers = 16) model_data = furrr::future_walk( .x = target_species, - .options = furrr::furrr_options(seed = 42), + .options = furrr::furrr_options(seed = 42, scheduling = FALSE), # make sure workers get constant stream of work .env_globals = c(raster_filepaths, sa_polygon, occs_final, range_maps, proj_string), .f = function(species){ # Prepare species occurrence data ##### occs_spec = dplyr::filter(occs_final, species == !!species) + if(nrow(occs_spec) <= 1){ + warning("Skipping species ", species, ": Can't process species with a single presence record." ) + return() + } occs_multipoint = occs_spec %>% summarise(geometry = st_combine(geometry)) @@ -85,7 +89,19 @@ model_data = furrr::future_walk( # Define spatial KDE ref = st_bbox(sampling_region)[c(1,3,2,4)] - spatial_kde = spatialEco::sf.kde(occs_spec, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% + + # Function to estimate bandwidth + # ---> this function is the default estimation of the sf.kde package but with a minimum of 50000 = 50km) + calc_bw <- function(x) { + r <- stats::quantile(x, c(0.25, 0.75)) + h <- (r[2] - r[1])/1.34 + result = 4 * 1.06 * min(sqrt(stats::var(x)), h) * length(x)^(-1/5) + return(max(result, 50000)) + } + + bw_x = calc_bw(st_coordinates(occs_spec)[,"X"]) + bw_y = calc_bw(st_coordinates(occs_spec)[,"Y"]) + spatial_kde = spatialEco::sf.kde(occs_spec, bw = c(bw_x, bw_y), res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% crop(sampling_region, mask = T, touches = F) # Define reverse niche @@ -97,38 +113,43 @@ model_data = furrr::future_walk( env_kde <- (1-predict(env_sampling_region, maxent_fit)) %>% terra::rast() - # Rejection sampling of pseudo absences - abs_spec = st_sf(geometry = st_sfc(), crs = proj_string) + # Sample pseudo absences abs_prob = spatial_kde * env_kde - while(nrow(abs_spec) < nrow(occs_spec)){ - sample_points_raw = st_sample(sampling_region, nrow(occs_spec)) - p = na.omit(extract(abs_prob, vect(sample_points_raw))$z) - x = runif(length(p)) - - sample_points = sample_points_raw[x<p] - if(length(sample_points) == 0){ - next - } - - samples_required = nrow(occs_spec) - nrow(abs_spec) - if(length(sample_points) > samples_required){ - sample_points = sample(sample_points, samples_required) - } + abs_spec_list = list() + + samples_required = nrow(occs_spec) + + while(samples_required != 0){ + sample_points = spatSample(abs_prob, size = samples_required, method = "weights", xy = T, as.df = T) %>% + dplyr::select(-z) %>% + dplyr::mutate(x = x + runif(nrow(.), -5000, 5000), # Add jitter (res/2) to cell centroids + y = y + runif(nrow(.), -5000, 5000)) %>% # Add jitter (res/2) to cell centroids + st_as_sf(coords = c("x", "y"), crs = proj_string) - abs_env = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) %>% - dplyr::mutate( - present = 0, - geometry = st_geometry(sample_points) - ) %>% + sample_abs = bind_cols( + terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) + ) %>% + bind_cols(sample_points) %>% drop_na() - abs_spec = bind_rows(abs_spec, abs_env) + abs_spec_list[[length(abs_spec_list)+1]] = sample_points + + samples_required = samples_required - nrow(sample_points) # Sometimes there are no env data for sample points, so keep sampling } - abs_coords = st_coordinates(st_transform(abs_spec, "EPSG:4326")) - abs_spec = bind_cols(abs_spec, abs_coords) %>% - dplyr::rename(longitude = X, latitude = Y) %>% - dplyr::mutate(species = !!species) + abs_spec = bind_rows(abs_spec_list) + abs_spec = abs_spec %>% + bind_cols( + st_coordinates(st_transform(abs_spec, "EPSG:4326")) + ) %>% + dplyr::rename( + longitude = X, + latitude = Y + ) %>% + dplyr::mutate( + species = !!species, + present = 0 + ) # Create presence-absence dataframe pa_spec = occs_spec %>% @@ -167,15 +188,16 @@ model_data = furrr::future_walk( NA }) - pa_spec$fold = folds + pa_spec$fold_eval = folds pa_spec$geometry = NULL save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) - }) + } +) # Combine presence-absence data files = list.files("data/r_objects/pa_sampling/", full.names = T) model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% bind_rows() -save(model_data, file = "data/r_objects/pa_sampling/model_data.RData") +save(model_data, file = "data/r_objects/model_data_pa_sampling.RData") -- GitLab