Skip to content
Snippets Groups Projects
Commit c33c0424 authored by ye87zine's avatar ye87zine
Browse files

protoyped kde based absence sampling

parent cbc47e17
No related branches found
No related tags found
1 merge request!1Presence absence sampling
......@@ -20,14 +20,8 @@ sf::sf_use_s2(use_s2 = FALSE) # Don't use spherical geometry
# ---------------------------------------------------------------------------#
# Prepare Geodata ####
# ---------------------------------------------------------------------------#
raster_info = tbl(con, "datasets") %>%
dplyr::filter(stringr::str_detect(name, "CHELSA")) %>%
collect()
# raster_filepaths = list.files(raster_info$raw_data_path, pattern = ".tif$", full.names = T) %>%
# stringr::str_sort(numeric = T)
raster_filepaths = list.files("I:/mas_data/00_data/processed/CHELSA_v2-1_bioclim", pattern = ".tif$", full.names = T) %>%
# TODO: finalize predictor choice
raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>%
stringr::str_sort(numeric = T)
sa_polygon = rnaturalearth::ne_countries() %>%
......@@ -40,6 +34,7 @@ sa_polygon = rnaturalearth::ne_countries() %>%
load("data/r_objects/range_maps.RData")
target_species = unique(range_maps$name_matched[!is.na(range_maps$name_matched)])
# Query species from Symobio
occs = tbl(con, "species_occurrences") %>%
dplyr::filter(species %in% target_species) %>%
dplyr::select(-year) %>%
......@@ -48,6 +43,7 @@ occs = tbl(con, "species_occurrences") %>%
sf::st_as_sf(coords = c("longitude", "latitude"), remove = F, crs = sf::st_crs(4326)) %>%
sf::st_filter(sa_polygon)
# Flag occurrences using Coordinate cleaner
occs_flagged = occs %>%
dplyr::distinct(species, coordinate_id, longitude, latitude) %>%
group_by(species) %>%
......@@ -64,97 +60,9 @@ occs_flagged = occs %>%
dplyr::filter(.summary == T) %>%
dplyr::select(species, coordinate_id, longitude, latitude)
env_vars = tbl(con, "raster_extracts_num") %>%
dplyr::filter(
coordinate_id %in% occs$coordinate_id,
metric == "mean"
) %>%
arrange(raster_layer_id) %>%
tidyr::pivot_wider(id_cols = coordinate_id, names_from = raster_layer_id, names_prefix = "layer_", values_from = value) %>%
collect()
# Finalize and save occurrence dataset
occs_final = occs %>%
inner_join(occs_flagged, by = c("species", "coordinate_id", "longitude", "latitude")) %>%
inner_join(env_vars, by = "coordinate_id") %>%
dplyr::select(-coordinate_id)
save(occs_final, file = "data/r_objects/occs_final.RData")
# ---------------------------------------------------------------------------#
# Create Modeling dataset ####
# ---------------------------------------------------------------------------#
load("data/r_objects/occs_final.RData")
load("data/r_objects/sa_polygon.RData")
sf::sf_use_s2(use_s2 = FALSE)
occs_split = split(occs_final, occs_final$species)
future::plan("multisession", workers = 16)
model_data = furrr::future_map(occs_split, .progress = TRUE, .options = furrr::furrr_options(seed = 42), .f = function(occs_spec){
# Define model/sampling region
occs_bbox = occs_spec %>%
sf::st_bbox() %>%
expand_bbox(min_span = 1, expansion = 0.25) %>%
sf::st_as_sfc() %>%
st_set_crs(st_crs(occs_spec))
sample_region = suppressMessages(
st_as_sf(st_intersection(occs_bbox, sa_polygon))
)
# Sample pseudo absence
sample_points = st_as_sf(st_sample(sample_region, nrow(occs_spec))) %>%
dplyr::mutate(
species = occs_spec$species[1],
longitude = sf::st_coordinates(.)[,1],
latitude = sf::st_coordinates(.)[,2]
) %>%
dplyr::rename(geometry = "x")
abs_spec = terra::rast(raster_filepaths) %>%
setNames(paste0("layer_", 1:19)) %>%
terra::extract(sample_points) %>%
dplyr::select(-ID) %>%
dplyr::mutate(
present = 0,
geometry = sample_points$x
) %>%
tibble() %>%
bind_cols(sample_points)
# Create presence-absence dataframe
pa_spec = occs_spec %>%
dplyr::mutate(present = 1) %>%
bind_rows(abs_spec)
# Split into train and test datasets
train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
pa_spec$train = 0
pa_spec$train[train_index] = 1
# Define cross-validation folds
folds = tryCatch({
spatial_folds = suppressMessages(
blockCV::cv_spatial(
pa_spec,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
)
spatial_folds$folds_ids
}, warning = function(w){
NA
}, error = function(e){
NA
})
pa_spec$folds = folds
pa_spec$geometry = NULL
return(pa_spec)
})
model_data = bind_rows(model_data)
save(model_data, file = "data/r_objects/model_data.RData")
save(occs_final, file = "data/r_objects/occs_final.RData")
\ No newline at end of file
# General packages
library(dplyr)
library(tidyr)
library(furrr)
# Geo packages
library(terra)
library(CoordinateCleaner)
library(sf)
library(MASS)
library(geos)
# DB packages
library(Symobio)
library(DBI)
source("R/utils.R")
load("data/r_objects/range_maps.RData")
load("data/r_objects/occs_final.RData")
# ---------------------------------------------------------------------------#
# Prepare Geodata ####
# ---------------------------------------------------------------------------#
target_species = unique(occs_final$species)
sa_polygon = rnaturalearth::ne_countries() %>%
dplyr::filter(continent == "South America") %>%
sf::st_union()
raster_filepaths = list.files("~/share/groups/mas_data/00_data/processed/CHELSA_v2-1_bioclim/", pattern = ".tif$", full.names = T) %>% # TODO: finalize predictor choice
stringr::str_sort(numeric = T)
raster_data = terra::rast(raster_filepaths) %>%
terra::crop(sf::st_bbox(sa_polygon))
# Project (proj string generated with SA bbox coordinates on https://projectionwizard.org)
proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs"
raster_data = terra::project(raster_data, proj_string)
sa_polygon = st_transform(sa_polygon, proj_string)
occs_final = st_transform(occs_final, proj_string)
range_maps = st_transform(range_maps, proj_string)
# ---------------------------------------------------------------------------#
# 1. Distance-based sampling ####
# #
# Absences should be predominantly sampled from regions that are #
# geographically close (reproduce sampling biases) but environmentally #
# dissimilar (avoid false negatives) to the known occurrences #
# ---------------------------------------------------------------------------#
# Geographic distance
future::plan("multisession", workers = 16)
pseudo_absences = furrr::future_map(
.x = target_species,
.options = furrr::furrr_options(seed = 42),
.f = function(species){
# Prepare occurrences
occs_spec = dplyr::filter(occs_final, species == !!species)
occs_poly = occs_spec %>%
summarise(geometry = st_combine(geometry)) %>%
st_concave_hull(ratio = 0.5)
occs_bff = occs_spec %>%
st_buffer(10000) %>% # Buffer by 10 kilometers as exclusion radius for absence sampling
st_union()
range_spec = dplyr::filter(range_maps, name_matched == !!species) %>%
rename(species = name_matched)
# Define model/sampling region
sampling_region = st_union(range_spec, occs_poly) %>% # Union occs poly and expert range
st_buffer(dist = 100000) %>% # Buffer by 100 km to extend sampling region into new environments
st_intersection(sa_polygon) %>% # Crop to South America
st_difference(occs_bff) # Crop out buffered presences
# Define KDE
ref = st_bbox(sampling_region)[c(1,3,2,4)]
min_extent = min(ref[2]-ref[1], ref[4]-ref[3])
occs_kde = spatialEco::sf.kde(occs_spec, bw = min_extent/2, res = 10000, ref = ref, standardize = T, scale.factor = 1) %>%
crop(sampling_region, mask = T, touches = F)
# Sampling of pseudo absence
abs_spec = st_sf(geometry = st_sfc(), crs = proj_string)
while(nrow(abs_spec) < nrow(occs_spec)){
sample_points_raw = st_sample(sampling_region, nrow(occs_spec))
p = extract(occs_kde, vect(sample_points_raw))$z
x = runif(length(p))
sample_points = sample_points_raw[x<p]
samples_required = nrow(occs_spec) - nrow(abs_spec)
if(length(sample_points) > samples_required){
sample_points = sample(sample_points, samples_required)
}
abs_spec = bind_rows(abs_spec, st_sf(geometry = sample_points))
}
plot(sa_polygon)
plot(occs_kde, add = T, legend = F)
plot(range_spec, add = T, col = "#00000000")
plot(abs_spec, add = T, col = "black", pch = 3, cex = 0.5)
plot(occs_spec, add = T, col = "white", pch = 3, cex = 0.5)
sample_points = st_as_sf(st_sample(sample_region, nrow(abs_spec))) %>%
dplyr::mutate(
species = occs_spec$species[1],
longitude = sf::st_coordinates(.)[,1],
latitude = sf::st_coordinates(.)[,2]
) %>%
dplyr::rename(geometry = "x")
abs_spec = terra::rast(raster_filepaths) %>%
setNames(paste0("layer_", 1:19)) %>%
terra::extract(sample_points) %>%
dplyr::select(-ID) %>%
dplyr::mutate(
present = 0,
geometry = sample_points$x
) %>%
tibble() %>%
bind_cols(sample_points)
# Create presence-absence dataframe
pa_spec = occs_spec %>%
dplyr::mutate(present = 1) %>%
bind_rows(abs_spec)
# Split into train and test datasets
train_index = createDataPartition(pa_spec$present, p = 0.7, list = FALSE)
pa_spec$train = 0
pa_spec$train[train_index] = 1
# Define cross-validation folds
folds = tryCatch({
spatial_folds = suppressMessages(
blockCV::cv_spatial(
pa_spec,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
)
spatial_folds$folds_ids
}, warning = function(w){
NA
}, error = function(e){
NA
})
pa_spec$folds = folds
pa_spec$geometry = NULL
return(pa_spec)
})
model_data = bind_rows(model_data)
save(model_data, file = "data/r_objects/model_data.RData")
ggplot() +
geom_sf(data = sa_polygon, fill = "red") +
geom_sf(data = range_spec, fill = "green") +
geom_sf(data = occs_spec) +
# facet_wrap("species", nrow = 2) +
# theme_minimal()
#
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment