From c33c0424fc345f738efc284ba5b47dfde9643702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ye87zine@usr.idiv.de> Date: Wed, 18 Dec 2024 22:27:33 +0100 Subject: [PATCH] protoyped kde based absence sampling --- R/03_01_presence_preparation.R | 68 +++++++++++ R/03_02_absence_preparation.R | 174 ++++++++++++++++++++++++++++ R/03_presence_absence_preparation.R | 160 ------------------------- 3 files changed, 242 insertions(+), 160 deletions(-) create mode 100644 R/03_01_presence_preparation.R create mode 100644 R/03_02_absence_preparation.R delete mode 100644 R/03_presence_absence_preparation.R diff --git a/R/03_01_presence_preparation.R b/R/03_01_presence_preparation.R new file mode 100644 index 0000000..c8705b9 --- /dev/null +++ b/R/03_01_presence_preparation.R @@ -0,0 +1,68 @@ +# General packages +library(dplyr) +library(tidyr) +library(furrr) + +# Geo packages +library(terra) +library(CoordinateCleaner) +library(sf) + +# DB packages +library(Symobio) +library(DBI) + +source("R/utils.R") + +con = db_connect() # Connection to Symobio +sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry + +# ---------------------------------------------------------------------------# +# Prepare Geodata #### +# ---------------------------------------------------------------------------# +# TODO: finalize predictor choice +raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% + stringr::str_sort(numeric = T) + +sa_polygon = rnaturalearth::ne_countries() %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() + +# ---------------------------------------------------------------------------# +# Prepare Occurrence Data #### +# ---------------------------------------------------------------------------# +load("data/r_objects/range_maps.RData") +target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) + +# Query species from Symobio +occs = tbl(con, "species_occurrences") %>% + dplyr::filter(species %in% target_species) %>% + dplyr::select(-year) %>% + dplyr::distinct() %>% + collect() %>% + sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>% + sf::st_filter(sa_polygon) + +# Flag occurrences using Coordinate cleaner +occs_flagged = occs %>% + dplyr::distinct(species, coordinate_id, longitude, latitude) %>% + group_by(species) %>% + group_split() %>% + purrr::map( # Loop over species individually due to bug in CoordinateCleaner + CoordinateCleaner::clean_coordinates, + lon = "longitude", + lat = "latitude", + tests = c("centroids", "gbif", "equal", "institutions", "outliers"), + outliers_method = "quantile", + verbose = F + ) %>% + bind_rows() %>% + dplyr::filter(.summary == T) %>% + dplyr::select(species, coordinate_id, longitude, latitude) + +# Finalize and save occurrence dataset +occs_final = occs %>% + inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% + dplyr::select(-coordinate_id) + +save(occs_final, file = "data/r_objects/occs_final.RData") \ No newline at end of file diff --git a/R/03_02_absence_preparation.R b/R/03_02_absence_preparation.R new file mode 100644 index 0000000..b72d132 --- /dev/null +++ b/R/03_02_absence_preparation.R @@ -0,0 +1,174 @@ +# General packages +library(dplyr) +library(tidyr) +library(furrr) + +# Geo packages +library(terra) +library(CoordinateCleaner) +library(sf) +library(MASS) +library(geos) + +# DB packages +library(Symobio) +library(DBI) + +source("R/utils.R") + +load("data/r_objects/range_maps.RData") +load("data/r_objects/occs_final.RData") + +# ---------------------------------------------------------------------------# +# Prepare Geodata #### +# ---------------------------------------------------------------------------# +target_species = unique(occs_final$species) + +sa_polygon = rnaturalearth::ne_countries() %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() + +raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice + stringr::str_sort(numeric = T) + +raster_data = terra::rast(raster_filepaths) %>% + terra::crop(sf::st_bbox(sa_polygon)) + +# Project (proj string generated with SA bbox coordinates on https://projectionwizard.org) +proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" + +raster_data = terra::project(raster_data, proj_string) +sa_polygon = st_transform(sa_polygon, proj_string) +occs_final = st_transform(occs_final, proj_string) +range_maps = st_transform(range_maps, proj_string) + +# ---------------------------------------------------------------------------# +# 1. Distance-based sampling #### +# # +# Absences should be predominantly sampled from regions that are # +# geographically close (reproduce sampling biases) but environmentally # +# dissimilar (avoid false negatives) to the known occurrences # +# ---------------------------------------------------------------------------# +# Geographic distance +future::plan("multisession", workers = 16) +pseudo_absences = furrr::future_map( + .x = target_species, + .options = furrr::furrr_options(seed = 42), + .f = function(species){ + # Prepare occurrences + occs_spec = dplyr::filter(occs_final, species == !!species) + + occs_poly = occs_spec %>% + summarise(geometry = st_combine(geometry)) %>% + st_concave_hull(ratio = 0.5) + + occs_bff = occs_spec %>% + st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling + st_union() + + range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% + rename(species = name_matched) + + # Define model/sampling region + sampling_region = st_union(range_spec, occs_poly) %>% # Union occs poly and expert range + st_buffer(dist = 100000) %>% # Buffer by 100 km to extend sampling region into new environments + st_intersection(sa_polygon) %>% # Crop to South America + st_difference(occs_bff) # Crop out buffered presences + + # Define KDE + ref = st_bbox(sampling_region)[c(1,3,2,4)] + min_extent = min(ref[2]-ref[1], ref[4]-ref[3]) + occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>% + crop(sampling_region, mask = T, touches = F) + + # Sampling of pseudo absence + abs_spec = st_sf(geometry = st_sfc(), crs = proj_string) + while(nrow(abs_spec) < nrow(occs_spec)){ + sample_points_raw = st_sample(sampling_region, nrow(occs_spec)) + p = extract(occs_kde, vect(sample_points_raw))$z + x = runif(length(p)) + + sample_points = sample_points_raw[x<p] + samples_required = nrow(occs_spec) - nrow(abs_spec) + + if(length(sample_points) > samples_required){ + sample_points = sample(sample_points, samples_required) + } + + abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points)) + } + + plot(sa_polygon) + plot(occs_kde, add = T, legend = F) + plot(range_spec, add = T, col = "#00000000") + plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5) + plot(occs_spec, add = T, col = "white", pch = 3, cex = 0.5) + + + sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>% + dplyr::mutate( + species = occs_spec$species[1], + longitude = sf::st_coordinates(.)[,1], + latitude = sf::st_coordinates(.)[,2] + ) %>% + dplyr::rename(geometry = "x") + + abs_spec = terra::rast(raster_filepaths) %>% + setNames(paste0("layer_", 1:19)) %>% + terra::extract(sample_points) %>% + dplyr::select(-ID) %>% + dplyr::mutate( + present = 0, + geometry = sample_points$x + ) %>% + tibble() %>% + bind_cols(sample_points) + + # Create presence-absence dataframe + pa_spec = occs_spec %>% + dplyr::mutate(present = 1) %>% + bind_rows(abs_spec) + + # Split into train and test datasets + train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) + pa_spec$train = 0 + pa_spec$train[train_index] = 1 + + # Define cross-validation folds + folds = tryCatch({ + spatial_folds = suppressMessages( + blockCV::cv_spatial( + pa_spec, + column = "present", + k = 5, + progress = F, plot = F, report = F + ) + ) + + spatial_folds$folds_ids + }, warning = function(w){ + NA + }, error = function(e){ + NA + }) + + pa_spec$folds = folds + pa_spec$geometry = NULL + + return(pa_spec) + }) + +model_data = bind_rows(model_data) +save(model_data, file = "data/r_objects/model_data.RData") + + + + +ggplot() + + geom_sf(data = sa_polygon, fill = "red") + + geom_sf(data = range_spec, fill = "green") + + geom_sf(data = occs_spec) + + # facet_wrap("species", nrow = 2) + + # theme_minimal() + # + \ No newline at end of file diff --git a/R/03_presence_absence_preparation.R b/R/03_presence_absence_preparation.R deleted file mode 100644 index 9fd817c..0000000 --- a/R/03_presence_absence_preparation.R +++ /dev/null @@ -1,160 +0,0 @@ -# General packages -library(dplyr) -library(tidyr) -library(furrr) - -# Geo packages -library(terra) -library(CoordinateCleaner) -library(sf) - -# DB packages -library(Symobio) -library(DBI) - -source("R/utils.R") - -con = db_connect() # Connection to Symobio -sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry - -# ---------------------------------------------------------------------------# -# Prepare Geodata #### -# ---------------------------------------------------------------------------# -raster_info = tbl(con, "datasets") %>% - dplyr::filter(stringr::str_detect(name, "CHELSA")) %>% - collect() - -# raster_filepaths = list.files(raster_info$raw_data_path, pattern = ".tif$", full.names = T) %>% -# stringr::str_sort(numeric = T) - -raster_filepaths = list.files("I:/mas_data/00_data/processed/CHELSA_v2-1_bioclim", pattern = ".tif$", full.names = T) %>% - stringr::str_sort(numeric = T) - -sa_polygon = rnaturalearth::ne_countries() %>% - dplyr::filter(continent == "South America") %>% - sf::st_union() - -# ---------------------------------------------------------------------------# -# Prepare Occurrence Data #### -# ---------------------------------------------------------------------------# -load("data/r_objects/range_maps.RData") -target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)]) - -occs = tbl(con, "species_occurrences") %>% - dplyr::filter(species %in% target_species) %>% - dplyr::select(-year) %>% - dplyr::distinct() %>% - collect() %>% - sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>% - sf::st_filter(sa_polygon) - -occs_flagged = occs %>% - dplyr::distinct(species, coordinate_id, longitude, latitude) %>% - group_by(species) %>% - group_split() %>% - purrr::map( # Loop over species individually due to bug in CoordinateCleaner - CoordinateCleaner::clean_coordinates, - lon = "longitude", - lat = "latitude", - tests = c("centroids", "gbif", "equal", "institutions", "outliers"), - outliers_method = "quantile", - verbose = F - ) %>% - bind_rows() %>% - dplyr::filter(.summary == T) %>% - dplyr::select(species, coordinate_id, longitude, latitude) - -env_vars = tbl(con, "raster_extracts_num") %>% - dplyr::filter( - coordinate_id %in% occs$coordinate_id, - metric == "mean" - ) %>% - arrange(raster_layer_id) %>% - tidyr::pivot_wider(id_cols = coordinate_id, names_from = raster_layer_id, names_prefix = "layer_", values_from = value) %>% - collect() - -occs_final = occs %>% - inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>% - inner_join(env_vars, by = "coordinate_id") %>% - dplyr::select(-coordinate_id) - -save(occs_final, file = "data/r_objects/occs_final.RData") - -# ---------------------------------------------------------------------------# -# Create Modeling dataset #### -# ---------------------------------------------------------------------------# -load("data/r_objects/occs_final.RData") -load("data/r_objects/sa_polygon.RData") -sf::sf_use_s2(use_s2 = FALSE) - -occs_split = split(occs_final, occs_final$species) - -future::plan("multisession", workers = 16) -model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){ - # Define model/sampling region - occs_bbox = occs_spec %>% - sf::st_bbox() %>% - expand_bbox(min_span = 1, expansion = 0.25) %>% - sf::st_as_sfc() %>% - st_set_crs(st_crs(occs_spec)) - - sample_region = suppressMessages( - st_as_sf(st_intersection(occs_bbox, sa_polygon)) - ) - - # Sample pseudo absence - sample_points = st_as_sf(st_sample(sample_region, nrow(occs_spec))) %>% - dplyr::mutate( - species = occs_spec$species[1], - longitude = sf::st_coordinates(.)[,1], - latitude = sf::st_coordinates(.)[,2] - ) %>% - dplyr::rename(geometry = "x") - - abs_spec = terra::rast(raster_filepaths) %>% - setNames(paste0("layer_", 1:19)) %>% - terra::extract(sample_points) %>% - dplyr::select(-ID) %>% - dplyr::mutate( - present = 0, - geometry = sample_points$x - ) %>% - tibble() %>% - bind_cols(sample_points) - - # Create presence-absence dataframe - pa_spec = occs_spec %>% - dplyr::mutate(present = 1) %>% - bind_rows(abs_spec) - - # Split into train and test datasets - train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE) - pa_spec$train = 0 - pa_spec$train[train_index] = 1 - - # Define cross-validation folds - folds = tryCatch({ - spatial_folds = suppressMessages( - blockCV::cv_spatial( - pa_spec, - column = "present", - k = 5, - progress = F, plot = F, report = F - ) - ) - - spatial_folds$folds_ids - }, warning = function(w){ - NA - }, error = function(e){ - NA - }) - - pa_spec$folds = folds - pa_spec$geometry = NULL - - return(pa_spec) -}) - -model_data = bind_rows(model_data) -save(model_data, file = "data/r_objects/model_data.RData") -- GitLab