Skip to content
Snippets Groups Projects
Commit c1fa2153 authored by ye87zine's avatar ye87zine
Browse files

Implement glock block CV strategy

parent fab18e93
Branches
No related tags found
No related merge requests found
# General packages # General packages
library(dplyr) library(dplyr)
library(tidyr) library(tidyr)
library(furrr) library(ggplot2)
# Geo packages # Geo packages
library(terra) library(terra)
library(CoordinateCleaner)
library(sf) library(sf)
library(geos) library(geos)
# Stat packages
library(MASS)
library(dismo)
library(spatialEco)
library(blockCV) library(blockCV)
library(caret)
source("R/utils.R") source("R/utils.R")
...@@ -137,22 +130,9 @@ for(species in setdiff(target_species, species_processed)){ ...@@ -137,22 +130,9 @@ for(species in setdiff(target_species, species_processed)){
) %>% ) %>%
bind_rows(abs_spec) bind_rows(abs_spec)
ref = st_bbox(sampling_region)[c(1,3,2,4)]
ggplot() +
ggtitle(species) +
geom_sf(data = sa_polygon) +
geom_sf(data = range_spec, alpha = 0, color = "red") +
geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
scale_color_discrete(name = "Presence") +
theme_minimal()
try({
ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf",
width = 8, height = 12)
})
# Define cross-validation folds # Define cross-validation folds
pa_spec_core = dplyr::filter(pa_spec, record_type != "background") pa_spec_core = dplyr::filter(pa_spec, record_type == "core")
pa_spec_background = dplyr::filter(pa_spec, record_type == "background") pa_spec_background = dplyr::filter(pa_spec, record_type == "background")
folds = tryCatch({ folds = tryCatch({
...@@ -171,21 +151,28 @@ for(species in setdiff(target_species, species_processed)){ ...@@ -171,21 +151,28 @@ for(species in setdiff(target_species, species_processed)){
}) })
pa_spec = pa_spec_core %>% pa_spec = pa_spec_core %>%
dplyr::mutate(fold_eval = folds) %>% dplyr::mutate(fold_individual = folds) %>%
bind_rows(pa_spec_background) %>% bind_rows(pa_spec_background) %>%
group_by(present) %>% group_by(present) %>%
mutate(weight = 1/n()) %>% mutate(weight = 1/n()) %>%
ungroup() ungroup()
# Plot result
ref = st_bbox(sampling_region)[c(1,3,2,4)]
ggplot() +
ggtitle(species) +
geom_sf(data = sa_polygon) +
geom_sf(data = range_spec, aes(fill = "IUCN range"), alpha = 0.1, color = NA, show.legend = "fill") +
geom_sf(data = sampling_region, aes(fill = "Sampling region"), alpha = 0.1, color = NA, show.legend = "fill") +
geom_sf(data = pa_spec, aes(color = as.factor(present), shape = as.factor(record_type)), size = 0.7) +
scale_fill_manual(name = "Polygons", values = c("IUCN range" = "red", "Sampling region" = "blue")) +
scale_color_discrete(name = "Presence") +
scale_shape_discrete(name = "Record type") +
theme_minimal()
try({
ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling", device = "pdf", scale = 3)
})
save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData")) save(pa_spec, file = paste0("data/r_objects/pa_sampling/", species, ".RData"))
} }
# Combine presence-absence data
files = list.files("data/r_objects/pa_sampling/", full.names = T)
model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>%
bind_rows() %>%
group_by(species) %>%
dplyr::filter(all(1:5 %in% fold_eval)) %>%
ungroup
save(model_data, file = "data/r_objects/model_data.RData")
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(sf)
library(blockCV)
source("R/utils.R")
# Combine presence-absence data
files = list.files("data/r_objects/pa_sampling/", full.names = T)
model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>%
bind_rows()
model_data_core = dplyr::filter(model_data, record_type == "core")
model_data_background = dplyr::filter(model_data, record_type == "background")
# Define global folds
spatial_folds = suppressMessages(
blockCV::cv_spatial(
model_data_core,
column = "present",
k = 5
)
)
model_data_core$fold_global = spatial_folds$folds_ids
model_data = bind_rows(model_data_core, model_data_background) %>%
arrange(species) %>%
relocate(contains("fold"), .after = last_col()) %>%
mutate(species = as.factor(species))
# Explore folds assignment
cv_plot(spatial_folds)
ggsave(filename = "plots/publication/global_folds_map.pdf", device = "pdf", scale = 2.5)
folds_summary = model_data %>%
dplyr::filter(record_type == "core") %>%
sf::st_drop_geometry() %>%
dplyr::group_by(species, fold_global) %>%
dplyr::summarise(
n_occ = length(which(present == 1)),
n_abs = length(which(present == 0))
)
folds_uniform = folds_summary %>%
dplyr::group_by(species, fold_global) %>%
dplyr::summarise(
is_uniform = n_occ == 0 | n_abs == 0
) %>%
dplyr::group_by(species) %>%
dplyr::summarise(
folds_total = n(),
folds_uniform = length(which(is_uniform)),
folds_mixed = folds_total - folds_uniform
) %>%
dplyr::ungroup()
plot_1 = ggplot(folds_uniform, aes(x=folds_total)) +
geom_histogram(binwidth=0.5, fill = "orange") +
labs(title="Number of folds per species", x="Count", y = "Number of species") +
theme_minimal()
plot_2 = ggplot(folds_uniform, aes(x=folds_uniform)) +
geom_histogram(binwidth=0.5, fill = "violet") +
labs(title="Number of uniform folds per species", x="Count", y = "Number of species",
subtitle="Uniform folds contain only absences or only presences and cannot be used in model evaluation.") +
theme_minimal()
plot_3 = ggplot(folds_uniform, aes(x=folds_mixed)) +
geom_histogram(binwidth=0.5, fill = "blue") +
labs(title="Number of mixed folds per species", x="Count", y = "Number of species",
subtitle="Mixed folds both absences and presences and can be used in model evaluation.") +
theme_minimal()
plot_combined = grid.arrange(plot_1, plot_2, plot_3, ncol = 1)
ggsave(filename = "plots/publication/global_folds_histogram.pdf", plot_combined, device = "pdf", height = 10, width = 7)
# Save Model data
save(model_data, file = "data/r_objects/model_data.RData")
\ No newline at end of file
...@@ -4,82 +4,67 @@ library(sf) ...@@ -4,82 +4,67 @@ library(sf)
library(caret) library(caret)
library(cito) library(cito)
library(pROC) library(pROC)
library(parallel)
library(doParallel)
library(foreach)
source("R/utils.R") source("R/utils.R")
load("data/r_objects/model_data.RData") load("data/r_objects/model_data.RData")
# ----------------------------------------------------------------------#
# Run training ####
# ----------------------------------------------------------------------#
species_processed = list.files("data/r_objects/ssdm_results/performance/", pattern = "RData") %>% species_processed = list.files("data/r_objects/ssdm_results/performance/", pattern = "RData") %>%
stringr::str_remove(".RData") stringr::str_remove(".RData")
data_split = model_data %>% data_split = model_data %>%
dplyr::filter(!is.na(fold_eval) & !species %in% species_processed) %>% sf::st_drop_geometry() %>%
dplyr::filter(!species %in% species_processed) %>%
dplyr::group_by(species) %>% dplyr::group_by(species) %>%
dplyr::group_split() dplyr::group_split()
for(pa_spec in data_split){ # ----------------------------------------------------------------------#
# Set up parallel processing ####
# ----------------------------------------------------------------------#
num_cores <- 2
cl <- makeCluster(num_cores)
registerDoParallel(cl)
# ----------------------------------------------------------------------#
# Run training ####
# ----------------------------------------------------------------------#
# Use foreach for parallel processing across species
foreach(pa_spec = data_split, .packages = c("dplyr", "caret", "cito", "pROC", "tidyr")) %dopar% {
na_performance = list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_,
precision = NA_real_, recall = NA_real_, f1 = NA_real_,
tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_)
predictors = paste0("bio", 1:19)
species = pa_spec$species[1] species = pa_spec$species[1]
print(species) print(species)
# Define empty result for performance eval
na_performance = list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_,
Precision = NA_real_, Recall = NA_real_, F1 = NA_real_,
TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
# Create factor presence column
pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P")) pa_spec$present_fct = factor(pa_spec$present, levels = c("0", "1"), labels = c("A", "P"))
folds = sort(unique(na.omit(pa_spec$fold_global)))
# Outer CV loop (for averaging performance metrics) # Outer CV loop (for averaging performance metrics)
performance_cv = lapply(1:5, function(fold){ performance_cv = lapply(folds, function(fold){
print(paste("Fold", fold)) print(paste("Fold", fold))
## Preparations ##### ## Preparations #####
data_test = dplyr::filter(pa_spec, fold_eval == fold) data_train = dplyr::filter(pa_spec, record_type == "background" | fold_global != fold) # include background samples
data_train = dplyr::filter(pa_spec, is.na(fold_eval) | fold_eval != fold) # include background samples data_test = dplyr::filter(pa_spec, fold_global == fold)
# Create inner CV folds for model training if(all(data_train$present == 0)){ # If only absences in training dataset --> return
folds_train = tryCatch({ return(NULL)
cv_train = blockCV::cv_spatial(
data_train,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
cv_train$folds_ids
}, error = function(e){
NA
})
if(is.numeric(folds_train)){
data_train$fold_train = folds_train
} else {
return()
} }
# Drop geometry
data_train$geometry = NULL
data_test$geometry = NULL
# Define caret training routine
index_train = lapply(unique(sort(data_train$fold_train)), function(x){
return(which(data_train$fold_train != x))
})
train_ctrl = caret::trainControl( train_ctrl = caret::trainControl(
search = "random", search = "random",
classProbs = TRUE, classProbs = TRUE,
index = index_train, number = 5,
summaryFunction = caret::twoClassSummary, summaryFunction = caret::twoClassSummary,
savePredictions = "final" savePredictions = "final"
) )
# Define predictors
predictors = paste0("bio", 1:19)
## Random Forest ##### ## Random Forest #####
rf_performance = tryCatch({ rf_performance = tryCatch({
# Fit model # Fit model
...@@ -87,9 +72,9 @@ for(pa_spec in data_split){ ...@@ -87,9 +72,9 @@ for(pa_spec in data_split){
x = data_train[, predictors], x = data_train[, predictors],
y = data_train$present_fct, y = data_train$present_fct,
method = "ranger", method = "ranger",
metric = "Accuracy",
trControl = train_ctrl, trControl = train_ctrl,
tuneLength = 8, tuneLength = 8,
weights = data_train$weight,
verbose = F verbose = F
) )
...@@ -106,6 +91,7 @@ for(pa_spec in data_split){ ...@@ -106,6 +91,7 @@ for(pa_spec in data_split){
method = "gbm", method = "gbm",
trControl = train_ctrl, trControl = train_ctrl,
tuneLength = 8, tuneLength = 8,
weights = data_train$weight,
verbose = F verbose = F
) )
evaluate_model(gbm_fit, data_test) evaluate_model(gbm_fit, data_test)
...@@ -120,6 +106,7 @@ for(pa_spec in data_split){ ...@@ -120,6 +106,7 @@ for(pa_spec in data_split){
y = data_train$present_fct, y = data_train$present_fct,
method = "gamSpline", method = "gamSpline",
tuneLength = 8, tuneLength = 8,
weights = data_train$weight,
trControl = train_ctrl trControl = train_ctrl
) )
evaluate_model(gam_fit, data_test) evaluate_model(gam_fit, data_test)
...@@ -137,7 +124,7 @@ for(pa_spec in data_split){ ...@@ -137,7 +124,7 @@ for(pa_spec in data_split){
loss = "binomial", loss = "binomial",
epochs = 200L, epochs = 200L,
burnin = 200L, burnin = 200L,
early_stopping = 30, early_stopping = 25,
lr = 0.001, lr = 0.001,
batchsize = min(ceiling(nrow(data_train)/10), 64), batchsize = min(ceiling(nrow(data_train)/10), 64),
dropout = 0.25, dropout = 0.25,
...@@ -157,7 +144,7 @@ for(pa_spec in data_split){ ...@@ -157,7 +144,7 @@ for(pa_spec in data_split){
performance_summary = tibble( performance_summary = tibble(
species = !!species, species = !!species,
obs = nrow(data_train), obs = nrow(data_train),
fold_eval = fold, fold_global = fold,
model = c("rf", "gbm", "gam", "nn"), model = c("rf", "gbm", "gam", "nn"),
auc = c(rf_performance$auc, gbm_performance$auc, gam_performance$auc, nn_performance$auc), auc = c(rf_performance$auc, gbm_performance$auc, gam_performance$auc, nn_performance$auc),
accuracy = c(rf_performance$accuracy, gbm_performance$accuracy, gam_performance$accuracy, nn_performance$accuracy), accuracy = c(rf_performance$accuracy, gbm_performance$accuracy, gam_performance$accuracy, nn_performance$accuracy),
...@@ -170,7 +157,7 @@ for(pa_spec in data_split){ ...@@ -170,7 +157,7 @@ for(pa_spec in data_split){
tn = c(rf_performance$tn, gbm_performance$tn, gam_performance$tn, nn_performance$tn), tn = c(rf_performance$tn, gbm_performance$tn, gam_performance$tn, nn_performance$tn),
fn = c(rf_performance$fn, gbm_performance$fn, gam_performance$fn, nn_performance$fn) fn = c(rf_performance$fn, gbm_performance$fn, gam_performance$fn, nn_performance$fn)
) %>% ) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value")
return(performance_summary) return(performance_summary)
}) })
...@@ -179,12 +166,15 @@ for(pa_spec in data_split){ ...@@ -179,12 +166,15 @@ for(pa_spec in data_split){
performance_spec = bind_rows(performance_cv) performance_spec = bind_rows(performance_cv)
if(nrow(performance_spec) != 0){ if(nrow(performance_spec) != 0){
performance_spec = performance_spec %>% performance_spec = performance_spec %>%
dplyr::arrange(fold_eval, model, metric) dplyr::arrange(fold_global, model, metric)
save(performance_spec, file = paste0("data/r_objects/ssdm_results/performance/", species, ".RData")) save(performance_spec, file = paste0("data/r_objects/ssdm_results/performance/", species, ".RData"))
} }
} }
# Stop the parallel cluster when done
stopCluster(cl)
# Combine results # Combine results
files = list.files("data/r_objects/ssdm_results/performance/", full.names = T, pattern = ".RData") files = list.files("data/r_objects/ssdm_results/performance/", full.names = T, pattern = ".RData")
ssdm_performance = lapply(files, function(f){load(f); return(performance_spec)}) %>% ssdm_performance = lapply(files, function(f){load(f); return(performance_spec)}) %>%
...@@ -196,7 +186,6 @@ save(ssdm_performance, file = "data/r_objects/ssdm_performance.RData") ...@@ -196,7 +186,6 @@ save(ssdm_performance, file = "data/r_objects/ssdm_performance.RData")
# Train full models #### # Train full models ####
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
data_split = model_data %>% data_split = model_data %>%
dplyr::filter(!is.na(fold_eval)) %>%
dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>% dplyr::mutate(present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))) %>%
dplyr::group_by(species) %>% dplyr::group_by(species) %>%
dplyr::group_split() dplyr::group_split()
...@@ -230,7 +219,7 @@ for(pa_spec in data_split){ ...@@ -230,7 +219,7 @@ for(pa_spec in data_split){
summaryFunction = caret::twoClassSummary, summaryFunction = caret::twoClassSummary,
savePredictions = "final" savePredictions = "final"
) )
# Define predictors # Define predictors
predictors = paste0("bio", 1:19) predictors = paste0("bio", 1:19)
...@@ -240,12 +229,11 @@ for(pa_spec in data_split){ ...@@ -240,12 +229,11 @@ for(pa_spec in data_split){
x = pa_spec[, predictors], x = pa_spec[, predictors],
y = pa_spec$present_fct, y = pa_spec$present_fct,
method = "ranger", method = "ranger",
metric = "Accuracy",
trControl = train_ctrl, trControl = train_ctrl,
tuneLength = 8, tuneLength = 8,
weights = data_train$weight,
verbose = F verbose = F
) )
save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData")) save(rf_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_rf_fit.RData"))
}) })
...@@ -258,6 +246,7 @@ for(pa_spec in data_split){ ...@@ -258,6 +246,7 @@ for(pa_spec in data_split){
method = "gbm", method = "gbm",
trControl = train_ctrl, trControl = train_ctrl,
tuneLength = 8, tuneLength = 8,
weights = data_train$weight,
verbose = F verbose = F
) )
save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData")) save(gbm_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gbm_fit.RData"))
...@@ -269,8 +258,9 @@ for(pa_spec in data_split){ ...@@ -269,8 +258,9 @@ for(pa_spec in data_split){
x = pa_spec[, predictors], x = pa_spec[, predictors],
y = pa_spec$present_fct, y = pa_spec$present_fct,
method = "gamSpline", method = "gamSpline",
trControl = train_ctrl,
tuneLength = 8, tuneLength = 8,
weights = data_train$weight,
trControl = train_ctrl
) )
save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData")) save(gam_fit, file = paste0("data/r_objects/ssdm_results/full_models/", species, "_gam_fit.RData"))
}) })
...@@ -285,9 +275,9 @@ for(pa_spec in data_split){ ...@@ -285,9 +275,9 @@ for(pa_spec in data_split){
loss = "binomial", loss = "binomial",
epochs = 200L, epochs = 200L,
burnin = 200L, burnin = 200L,
early_stopping = 30, early_stopping = 25,
lr = 0.001, lr = 0.001,
batchsize = min(ceiling(nrow(pa_spec)/10), 64), batchsize = min(ceiling(nrow(data_train)/10), 64),
dropout = 0.25, dropout = 0.25,
optimizer = config_optimizer("adam"), optimizer = config_optimizer("adam"),
validation = 0.2, validation = 0.2,
......
...@@ -7,7 +7,6 @@ source("R/utils.R") ...@@ -7,7 +7,6 @@ source("R/utils.R")
load("data/r_objects/model_data.RData") load("data/r_objects/model_data.RData")
model_data = model_data %>% model_data = model_data %>%
dplyr::mutate(species = as.factor(species)) %>%
sf::st_drop_geometry() sf::st_drop_geometry()
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
...@@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ...@@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " +
# 1. Cross validation # 1. Cross validation
for(fold in 1:5){ for(fold in 1:5){
# Prepare data # Prepare data
train_data = dplyr::filter(model_data, fold_eval != fold) data_train = model_data %>%
dplyr::filter(record_type == "background" | fold_global != fold)
# Run model # Run model
converged = F msdm_embed_fit = dnn(
while(!converged){ formula,
msdm_embed_fit = dnn( data = data_train,
formula, hidden = c(200L, 200L, 200L),
data = train_data, loss = "binomial",
hidden = c(200L, 200L, 200L), epochs = 5000,
loss = "binomial", lr = 0.001,
epochs = 5000, batchsize = 1024,
lr = 0.001, dropout = 0.25,
batchsize = 4096, burnin = 50,
dropout = 0.25, optimizer = config_optimizer("adam"),
burnin = 50, early_stopping = 100,
optimizer = config_optimizer("adam"), validation = 0.2,
early_stopping = 200, device = "cuda"
validation = 0.2, )
device = "cuda"
)
if(min(msdm_embed_fit$losses$valid_l, na.rm = T) < 0.4){
converged = T
}
}
save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold,".RData")) save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold,".RData"))
} }
...@@ -56,12 +49,11 @@ msdm_embed_fit = dnn( ...@@ -56,12 +49,11 @@ msdm_embed_fit = dnn(
loss = "binomial", loss = "binomial",
epochs = 5000, epochs = 5000,
lr = 0.001, lr = 0.001,
baseloss = 1, batchsize = 1024,
batchsize = 4096,
dropout = 0.25, dropout = 0.25,
burnin = 500, burnin = 50,
optimizer = config_optimizer("adam"), optimizer = config_optimizer("adam"),
early_stopping = 300, early_stopping = 100,
validation = 0.2, validation = 0.2,
device = "cuda" device = "cuda"
) )
...@@ -74,15 +66,15 @@ save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed ...@@ -74,15 +66,15 @@ save(msdm_embed_fit, file = paste0("data/r_objects/msdm_embed_results/msdm_embed
msdm_embed_performance = lapply(1:5, function(fold){ msdm_embed_performance = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData")) load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData"))
test_data_split = model_data %>% data_test_split = model_data %>%
dplyr::filter(fold_eval == fold) %>% dplyr::filter(fold_global == fold) %>%
dplyr::group_split(species) dplyr::group_split(species)
lapply(test_data_split, function(test_data_spec){ lapply(data_test_split, function(data_test_spec){
species = test_data_spec$species[1] species = data_test_spec$species[1]
performance = tryCatch({ performance = tryCatch({
evaluate_model(msdm_embed_fit, test_data_spec) evaluate_model(msdm_embed_fit, data_test_spec)
}, error = function(e){ }, error = function(e){
list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_,
precision = NA_real_, recall = NA_real_, f1 = NA_real_, precision = NA_real_, recall = NA_real_, f1 = NA_real_,
...@@ -93,11 +85,11 @@ msdm_embed_performance = lapply(1:5, function(fold){ ...@@ -93,11 +85,11 @@ msdm_embed_performance = lapply(1:5, function(fold){
as_tibble() %>% as_tibble() %>%
mutate( mutate(
species = !!species, species = !!species,
obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
fold_eval = !!fold, fold_global = !!fold,
model = "msdm_embed", model = "msdm_embed",
) %>% ) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value")
}) %>% }) %>%
bind_rows() bind_rows()
}) %>% }) %>%
......
...@@ -7,7 +7,6 @@ source("R/utils.R") ...@@ -7,7 +7,6 @@ source("R/utils.R")
load("data/r_objects/model_data.RData") load("data/r_objects/model_data.RData")
model_data = model_data %>% model_data = model_data %>%
dplyr::mutate(species = as.factor(species)) %>%
sf::st_drop_geometry() sf::st_drop_geometry()
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
...@@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ...@@ -19,31 +18,25 @@ formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " +
# 1. Cross validation # 1. Cross validation
for(fold in 1:5){ for(fold in 1:5){
# Prepare data # Prepare data
train_data = dplyr::filter(model_data, fold_eval != fold) data_train = model_data %>%
dplyr::filter(record_type == "background" | fold_global != fold)
# Run model # Run model
converged = F msdm_onehot_fit = dnn(
while(!converged){ formula,
msdm_onehot_fit = dnn( data = data_train,
formula, hidden = c(200L, 200L, 200L),
data = train_data, loss = "binomial",
hidden = c(200L, 200L, 200L), epochs = 5000,
loss = "binomial", lr = 0.001,
epochs = 5000, batchsize = 1024,
lr = 0.001, dropout = 0.25,
batchsize = 4096, burnin = 50,
dropout = 0.25, optimizer = config_optimizer("adam"),
burnin = 50, early_stopping = 100,
optimizer = config_optimizer("adam"), validation = 0.2,
early_stopping = 200, device = "cuda"
validation = 0.2, )
device = "cuda"
)
if(min(msdm_onehot_fit$losses$valid_l, na.rm = T) < 0.4){
converged = T
}
}
save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold,".RData")) save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold,".RData"))
} }
...@@ -56,12 +49,11 @@ msdm_onehot_fit = dnn( ...@@ -56,12 +49,11 @@ msdm_onehot_fit = dnn(
loss = "binomial", loss = "binomial",
epochs = 5000, epochs = 5000,
lr = 0.001, lr = 0.001,
baseloss = 1, batchsize = 1024,
batchsize = 4096,
dropout = 0.25, dropout = 0.25,
burnin = 500, burnin = 50,
optimizer = config_optimizer("adam"), optimizer = config_optimizer("adam"),
early_stopping = 300, early_stopping = 100,
validation = 0.2, validation = 0.2,
device = "cuda" device = "cuda"
) )
...@@ -74,15 +66,15 @@ save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_one ...@@ -74,15 +66,15 @@ save(msdm_onehot_fit, file = paste0("data/r_objects/msdm_onehot_results/msdm_one
msdm_onehot_performance = lapply(1:5, function(fold){ msdm_onehot_performance = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold, ".RData")) load(paste0("data/r_objects/msdm_onehot_results/msdm_onehot_fit_fold", fold, ".RData"))
test_data_split = model_data %>% data_test_split = model_data %>%
dplyr::filter(fold_eval == fold) %>% dplyr::filter(fold_global == fold) %>%
dplyr::group_split(species) dplyr::group_split(species)
lapply(test_data_split, function(test_data_spec){ lapply(data_test_split, function(data_test_spec){
species = test_data_spec$species[1] species = data_test_spec$species[1]
performance = tryCatch({ performance = tryCatch({
evaluate_model(msdm_onehot_fit, test_data_spec) evaluate_model(msdm_onehot_fit, data_test_spec)
}, error = function(e){ }, error = function(e){
list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_, list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_,
precision = NA_real_, recall = NA_real_, f1 = NA_real_, precision = NA_real_, recall = NA_real_, f1 = NA_real_,
...@@ -93,11 +85,11 @@ msdm_onehot_performance = lapply(1:5, function(fold){ ...@@ -93,11 +85,11 @@ msdm_onehot_performance = lapply(1:5, function(fold){
as_tibble() %>% as_tibble() %>%
mutate( mutate(
species = !!species, species = !!species,
obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
fold_eval = !!fold, fold_global = !!fold,
model = "msdm_onehot", model = "msdm_onehot",
) %>% ) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value")
}) %>% }) %>%
bind_rows() bind_rows()
}) %>% }) %>%
......
...@@ -5,42 +5,35 @@ library(ranger) ...@@ -5,42 +5,35 @@ library(ranger)
source("R/utils.R") source("R/utils.R")
load("data/r_objects/model_data_random_abs_extended.RData") load("data/r_objects/model_data.RData")
model_data = model_data %>% model_data = model_data %>%
dplyr::filter(!is.na(fold_eval)) %>%
dplyr::mutate( dplyr::mutate(
species = as.factor(species),
present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P")) present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
) ) %>%
sf::st_drop_geometry()
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
# Train model #### # Train model ####
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
# Define predictors # Define predictors
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species") predictors = c(paste0("bio", 1:19), "species")
# Cross validation # Cross validation
for(fold in 1:5){ for(fold in 1:5){
print(paste("Fold", fold))
## Preparations ##### ## Preparations #####
data_train = dplyr::filter(model_data, fold_eval != fold) %>% data_train = model_data %>%
sf::st_drop_geometry() dplyr::filter(record_type == "background" | fold_global != fold)
# Define caret training routine
train_ctrl = caret::trainControl( train_ctrl = caret::trainControl(
method = "cv", search = "random",
number = 5,
classProbs = TRUE, classProbs = TRUE,
number = 5,
summaryFunction = caret::twoClassSummary, summaryFunction = caret::twoClassSummary,
savePredictions = "final" savePredictions = "final"
) )
tune_grid = expand.grid(
mtry = c(2,4,6,8),
splitrule = "gini",
min.node.size = c(1,4,9,16)
)
# Run model # Run model
rf_fit = caret::train( rf_fit = caret::train(
x = data_train[, predictors], x = data_train[, predictors],
...@@ -48,53 +41,46 @@ for(fold in 1:5){ ...@@ -48,53 +41,46 @@ for(fold in 1:5){
method = "ranger", method = "ranger",
metric = "Accuracy", metric = "Accuracy",
trControl = train_ctrl, trControl = train_ctrl,
tuneGrid = tune_grid, tuneLength = 8,
weights = data_train$weight, weights = data_train$weight,
num.threads = 48 num.threads = 48
) )
save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData")) save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData"))
} }
# Full model # Full model
# Define caret training routine # Define caret training routine
full_data = model_data %>%
sf::st_drop_geometry()
train_ctrl = caret::trainControl( train_ctrl = caret::trainControl(
method = "cv", search = "random",
number = 5,
classProbs = TRUE, classProbs = TRUE,
number = 5,
summaryFunction = caret::twoClassSummary, summaryFunction = caret::twoClassSummary,
savePredictions = "final" savePredictions = "final"
) )
tune_grid = expand.grid(
mtry = c(2,4,6,8),
splitrule = "gini",
min.node.size = c(1,4,9,16)
)
# Run model # Run model
rf_fit = caret::train( rf_fit = caret::train(
x = full_data[, predictors], x = model_data[, predictors],
y = full_data$present_fct, y = model_data$present_fct,
method = "ranger", method = "ranger",
metric = "Accuracy", metric = "Accuracy",
trControl = train_ctrl, trControl = train_ctrl,
tuneGrid = tune_grid tuneLength = 8,
weights = full_data$weight,
num.threads = 48
) )
save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
# Evaluate model #### # Evaluate model ####
# ----------------------------------------------------------------------# # ----------------------------------------------------------------------#
msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){ msdm_rf_performance = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData")) load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData"))
test_data = model_data %>% test_data = model_data %>%
dplyr::filter(fold_eval == fold, record_type != "absence_background") %>% dplyr::filter(fold_global == fold) %>%
sf::st_drop_geometry() sf::st_drop_geometry()
actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P")) actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
...@@ -119,36 +105,36 @@ msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){ ...@@ -119,36 +105,36 @@ msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){
cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P") cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")
list( list(
AUC = as.numeric(auc), auc = as.numeric(auc),
Accuracy = cm$overall["Accuracy"], accuracy = cm$overall["Accuracy"],
Kappa = cm$overall["Kappa"], kappa = cm$overall["Kappa"],
Precision = cm$byClass["Precision"], precision = cm$byClass["Precision"],
Recall = cm$byClass["Recall"], recall = cm$byClass["Recall"],
F1 = cm$byClass["F1"], f1 = cm$byClass["F1"],
TP = cm$table["P", "P"], tp = cm$table["P", "P"],
FP = cm$table["P", "A"], fp = cm$table["P", "A"],
TN = cm$table["A", "A"], tn = cm$table["A", "A"],
FN = cm$table["A", "P"] fn = cm$table["A", "P"]
) )
}, error = function(e){ }, error = function(e){
list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, list(auc = NA_real_, accuracy = NA_real_, kappa = NA_real_,
Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, precision = NA_real_, recall = NA_real_, f1 = NA_real_,
TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) tp = NA_real_, fp = NA_real_, tn = NA_real_, fn = NA_real_)
}) })
performance_summary = performance %>% performance_summary = performance %>%
as_tibble() %>% as_tibble() %>%
mutate( mutate(
species = !!species, species = !!species,
obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), obs = nrow(dplyr::filter(model_data, species == !!species, fold_global != !!fold)),
fold_eval = !!fold, fold_global = !!fold,
model = "MSDM_rf_random_abs_extended", model = "msdm_rf",
) %>% ) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") %>% tidyr::pivot_longer(-any_of(c("species", "obs", "fold_global", "model")), names_to = "metric", values_to = "value") %>%
drop_na() drop_na()
}) %>% }) %>%
bind_rows() bind_rows()
}) %>% }) %>%
bind_rows() bind_rows()
save(msdm_rf_random_abs_extended_performance, file = paste0("data/r_objects/msdm_rf_random_abs_extended_performance.RData")) save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData"))
library(dplyr)
library(tidyr)
library(caret)
library(ranger)
source("R/utils.R")
load("data/r_objects/model_data_random_abs.RData")
model_data = model_data %>%
dplyr::filter(!is.na(fold_eval)) %>%
dplyr::mutate(
species = as.factor(species),
present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
)
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species")
full_data = model_data %>%
sf::st_drop_geometry()
train_ctrl = caret::trainControl(
search = "random",
classProbs = TRUE,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
# Run model
rf_fit = caret::train(
x = full_data[, predictors],
y = full_data$present_fct,
method = "ranger",
metric = "Accuracy",
trControl = train_ctrl,
tuneLength = 1,
num.threads = 48,
importance = 'impurity',
verbose = T
)
save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
varimp = varImp(rf_fit)
# Cross validation
for(fold in 1:5){
## Preparations #####
data_train = dplyr::filter(model_data, fold_eval != fold) %>%
sf::st_drop_geometry()
# Define caret training routine
train_ctrl = caret::trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
tune_grid = expand.grid(
mtry = c(2,4,6,8),
splitrule = "gini",
min.node.size = c(1,4,9,16)
)
# Run model
rf_fit = caret::train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "ranger",
metric = "Accuracy",
trControl = train_ctrl,
tuneGrid = tune_grid,
num.threads = 48,
verbose = F
)
save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
}
# ----------------------------------------------------------------------#
# Evaluate model ####
# ----------------------------------------------------------------------#
msdm_rf_random_abs_performance = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
test_data = dplyr::filter(model_data, fold_eval == fold) %>%
sf::st_drop_geometry()
actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
probs = predict(rf_fit, test_data, type = "prob")$P
preds = predict(rf_fit, test_data, type = "raw")
eval_dfs = data.frame(
species = test_data$species,
actual,
probs,
preds
) %>%
group_by(species) %>%
group_split()
lapply(eval_dfs, function(eval_df_spec){
species = eval_df_spec$species[1]
performance = tryCatch({
auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs, levels = c("P", "A"), direction = ">")$auc
cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")
list(
AUC = as.numeric(auc),
Accuracy = cm$overall["Accuracy"],
Kappa = cm$overall["Kappa"],
Precision = cm$byClass["Precision"],
Recall = cm$byClass["Recall"],
F1 = cm$byClass["F1"],
TP = cm$table["P", "P"],
FP = cm$table["P", "A"],
TN = cm$table["A", "A"],
FN = cm$table["A", "P"]
)
}, error = function(e){
list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_,
Precision = NA_real_, Recall = NA_real_, F1 = NA_real_,
TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
})
performance_summary = performance %>%
as_tibble() %>%
mutate(
species = !!species,
obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
fold_eval = !!fold,
model = "MSDM_rf_random_abs",
) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
}) %>%
bind_rows()
}) %>%
bind_rows()
save(msdm_rf_random_abs_performance, file = paste0("data/r_objects/msdm_rf_random_abs_performance.RData"))
...@@ -12,9 +12,9 @@ library(plotly) ...@@ -12,9 +12,9 @@ library(plotly)
library(DT) library(DT)
load("../data/r_objects/model_data.RData") load("../data/r_objects/model_data.RData")
load("../data/r_objects/ssdm_performance.RData") #load("../data/r_objects/ssdm_performance.RData")
load("../data/r_objects/msdm_embed_performance.RData") load("../data/r_objects/msdm_embed_performance.RData")
load("../data/r_objects/msdm_onehot_performance.RData") #load("../data/r_objects/msdm_onehot_performance.RData")
load("../data/r_objects/msdm_rf_performance.RData") load("../data/r_objects/msdm_rf_performance.RData")
load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps.RData")
...@@ -56,16 +56,12 @@ loess_fit = function(x, y, span = 0.75){ ...@@ -56,16 +56,12 @@ loess_fit = function(x, y, span = 0.75){
} }
# Performance table # Performance table
performance = ssdm_results %>% performance = msdm_embed_performance %>%
bind_rows(msdm_embed_performance) %>% #bind_rows(msdm_embed_performance) %>%
bind_rows(msdm_onehot_performance) %>% #bind_rows(msdm_onehot_performance) %>%
bind_rows(msdm_rf_performance) %>% bind_rows(msdm_rf_performance) %>%
dplyr::mutate( # TODO: remove when only working with new data
metric = stringr::str_to_lower(metric),
model = stringr::str_to_lower(model)
) %>%
dplyr::group_by(species, model, metric) %>% dplyr::group_by(species, model, metric) %>%
dplyr::summarise(value = mean(value, na.rm = F)) %>% dplyr::summarise(value = mean(value, na.rm = T)) %>%
dplyr::mutate( dplyr::mutate(
value = case_when( value = case_when(
((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
......
This diff is collapsed.
...@@ -13,10 +13,11 @@ library(cito) ...@@ -13,10 +13,11 @@ library(cito)
source("R/utils.R") source("R/utils.R")
load("data/r_objects/model_data_random_abs_extended.RData") load("data/r_objects/model_data.RData")
load("data/r_objects/ssdm_results.RData") load("data/r_objects/ssdm_performance.RData")
load("data/r_objects/msdm_embed_performance.RData") load("data/r_objects/msdm_embed_performance.RData")
load("data/r_objects/msdm_onehot_performance.RData") load("data/r_objects/msdm_onehot_performance.RData")
load("data/r_objects/msdm_rf_performance.RData")
load("data/r_objects/functional_groups.RData") load("data/r_objects/functional_groups.RData")
load("data/r_objects/sa_polygon.RData") load("data/r_objects/sa_polygon.RData")
...@@ -25,26 +26,27 @@ load("data/r_objects/range_maps.RData") ...@@ -25,26 +26,27 @@ load("data/r_objects/range_maps.RData")
sf::sf_use_s2(use_s2 = FALSE) sf::sf_use_s2(use_s2 = FALSE)
model_data = model_data %>% model_data = model_data %>%
dplyr::filter(!is.na(fold_eval)) %>%
dplyr::mutate(species = as.factor(species)) dplyr::mutate(species = as.factor(species))
# ------------------------------------------------------------------ # # ------------------------------------------------------------------ #
# 1. Collect performance results #### # 1. Collect performance results ####
# ------------------------------------------------------------------ # # ------------------------------------------------------------------ #
performance = ssdm_results %>% performance = ssdm_performance %>%
bind_rows(msdm_embed_performance) %>% bind_rows(msdm_embed_performance) %>%
bind_rows(msdm_onehot_performance) %>% bind_rows(msdm_onehot_performance) %>%
dplyr::group_by(species, model, metric) %>% bind_rows(msdm_rf_performance) %>%
dplyr::summarise(value = mean(value, na.rm = F)) %>%
dplyr::mutate( dplyr::mutate(
metric = factor(tolower(metric), levels = c("auc", "f1", "kappa", "accuracy", "precision", "recall")), metric = factor(tolower(metric), levels = c("auc", "f1", "kappa", "accuracy", "precision", "recall")),
model = factor(model, levels = c("GAM", "GBM", "RF", "NN", "MSDM_onehot", "MSDM_embed")), model = factor(model, levels = c("gam", "gbm", "rf", "nn", "msdm_onehot", "msdm_embed", "msdm_rf")),
value = case_when( value = case_when(
((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
.default = value .default = value
) )
) ) %>%
dplyr::group_by(species, model, metric) %>%
dplyr::summarise(value = mean(value, na.rm = F)) %>%
ungroup()
# ------------------------------------------------------------------ # # ------------------------------------------------------------------ #
# 2. Model comparison #### # 2. Model comparison ####
...@@ -81,13 +83,13 @@ obs_count = model_data %>% ...@@ -81,13 +83,13 @@ obs_count = model_data %>%
sf::st_drop_geometry() %>% sf::st_drop_geometry() %>%
dplyr::filter(present == 1, !is.na(fold_eval)) %>% dplyr::filter(present == 1, !is.na(fold_eval)) %>%
dplyr::group_by(species) %>% dplyr::group_by(species) %>%
dplyr::summarise(obs = n()) dplyr::summarise(n_obs = n())
df_plot = performance %>% df_plot = performance %>%
dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) %>% dplyr::filter(metric %in% c("auc", "f1", "kappa", "accuracy", "precision", "recall")) %>%
dplyr::left_join(obs_count, by = "species") dplyr::left_join(obs_count, by = "species")
ggplot(df_plot, aes(x = obs, y = value, color = model, fill = model)) + ggplot(df_plot, aes(x = n_obs, y = value, color = model, fill = model)) +
geom_point(alpha = 0.1) + geom_point(alpha = 0.1) +
geom_smooth() + geom_smooth() +
facet_wrap(~ metric, scales = "free_y") + facet_wrap(~ metric, scales = "free_y") +
...@@ -157,18 +159,24 @@ library(ranger) ...@@ -157,18 +159,24 @@ library(ranger)
plot_predictions = function(spec, model_data, raster_data, algorithms){ plot_predictions = function(spec, model_data, raster_data, algorithms){
# Species data # Species data
load("data/r_objects/range_maps.RData") load("data/r_objects/range_maps.RData")
load("data/r_objects/sa_polygon.RData")
pa_spec = model_data %>% pa_spec = model_data %>%
dplyr::filter(species == !!spec) dplyr::filter(species == !!spec, record_type == "core")
range_spec = range_maps %>% range_spec = range_maps %>%
dplyr::filter(name_matched == !!spec) %>% dplyr::filter(name_matched == !!spec) %>%
sf::st_transform(sf::st_crs(pa_spec)) sf::st_transform(sf::st_crs(pa_spec))
sa_polygon = sa_polygon %>%
sf::st_transform(crs(raster_data, proj = T))
# Extract raster values into df # Extract raster values into df
bbox_spec = sf::st_bbox(range_spec) %>% bbox_spec = sf::st_bbox(range_spec) %>%
expand_bbox(expansion = 0.25) expand_bbox(expansion = 0.5)
raster_crop = raster_data %>%
terra::crop(bbox_spec) %>%
terra::mask(vect(sa_polygon))
raster_crop = terra::crop(raster_data, bbox_spec)
pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),] pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),]
new_data = raster_crop %>% new_data = raster_crop %>%
...@@ -191,15 +199,6 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){ ...@@ -191,15 +199,6 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){
} else if(algorithm == "msdm_rf"){ } else if(algorithm == "msdm_rf"){
load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData") load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
} else if(algorithm == "msdm_rf_random_abs"){
load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
} else if(algorithm == "msdm_rf_fit_no_species"){
load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_full.RData")
predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
} else if(algorithm == "msdm_rf_fit_no_species_random_abs"){
load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_random_abs_full.RData")
predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
} else if(algorithm == "nn") { } else if(algorithm == "nn") {
load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData")) load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData"))
new_data_tmp = dplyr::select(new_data, -species) new_data_tmp = dplyr::select(new_data, -species)
...@@ -218,7 +217,7 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){ ...@@ -218,7 +217,7 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){
p = ggplot() + p = ggplot() +
tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) + tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) +
scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) + scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) +
geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FFFFFF99") + geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FF000088") +
geom_sf(data = range_spec, col = "red", fill = NA) + geom_sf(data = range_spec, col = "red", fill = NA) +
scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") + scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") +
theme_minimal() + theme_minimal() +
...@@ -236,16 +235,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){ ...@@ -236,16 +235,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms){
# Load raster # Load raster
raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>%
stringr::str_subset("CHELSA_bio") %>%
stringr::str_sort(numeric = T) stringr::str_sort(numeric = T)
raster_data = terra::rast(raster_filepaths) %>% raster_data = terra::rast(raster_filepaths) %>%
terra::crop(sf::st_bbox(sa_polygon)) %>% terra::crop(sf::st_bbox(sa_polygon)) %>%
terra::project(sf::st_crs(model_data)$input) terra::project(sf::st_crs(model_data)$input)
specs = sort(sample(levels(model_data$species), 4)) specs = c("Histiotus velatus", "Hydrochoerus hydrochaeris", "Euryoryzomys legatus", "Eumops trumbulli")
for(spec in specs){ for(spec in specs){
pdf(file = paste0("plots/range_predictions/", spec, " (msdm_rf).pdf"), width = 12) pdf(file = paste0("plots/range_predictions/", spec, ".pdf"), width = 12)
plots = plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_rf", "msdm_rf_random_abs", "msdm_rf_fit_no_species", "msdm_rf_fit_no_species_random_abs")) plots = plot_predictions(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "msdm_onehot", "msdm_embed", "msdm_rf"))
lapply(plots, print) lapply(plots, print)
dev.off() dev.off()
} }
...@@ -256,7 +256,6 @@ for(spec in specs){ ...@@ -256,7 +256,6 @@ for(spec in specs){
load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData") load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData")
load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData") load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData")
load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData") load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
compare_species_predictions = function(model, sample_size){ compare_species_predictions = function(model, sample_size){
specs = sample(unique(model_data$species), 2, replace = F) specs = sample(unique(model_data$species), 2, replace = F)
...@@ -329,7 +328,7 @@ obs_count = model_data %>% ...@@ -329,7 +328,7 @@ obs_count = model_data %>%
sf::st_drop_geometry() %>% sf::st_drop_geometry() %>%
dplyr::filter(present == 1, !is.na(fold_eval)) %>% dplyr::filter(present == 1, !is.na(fold_eval)) %>%
dplyr::group_by(species) %>% dplyr::group_by(species) %>%
dplyr::summarise(obs = n()) dplyr::summarise(n_obs = n())
all_embeddings = lapply(1:5, function(fold){ all_embeddings = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData")) load(paste0("data/r_objects/msdm_embed_results/msdm_embed_fit_fold", fold, ".RData"))
...@@ -377,7 +376,7 @@ r_embed_df = data.frame( ...@@ -377,7 +376,7 @@ r_embed_df = data.frame(
df_plot = obs_count %>% df_plot = obs_count %>%
dplyr::left_join(r_embed_df, by = "species") dplyr::left_join(r_embed_df, by = "species")
ggplot(data = df_plot, aes(x = obs, y = value)) + ggplot(data = df_plot, aes(x = n_obs, y = value)) +
geom_point() + geom_point() +
geom_smooth() + geom_smooth() +
labs(title = "Mean and variance of correlation coefficient of species embeddings across folds") + labs(title = "Mean and variance of correlation coefficient of species embeddings across folds") +
...@@ -557,3 +556,5 @@ range_dist_subset = range_dist[species_intersect, species_intersect] ...@@ -557,3 +556,5 @@ range_dist_subset = range_dist[species_intersect, species_intersect]
e_indices = match(species_intersect, species_lookup$species) e_indices = match(species_intersect, species_lookup$species)
embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices] embeddings_dist_subset = as.matrix(embeddings_dist)[e_indices, e_indices]
FactoMineR::coeffRV(embeddings_dist_subset, range_dist_subset) FactoMineR::coeffRV(embeddings_dist_subset, range_dist_subset)
# --> Significant correlation between embeddings and pairwise species distances (range > func > phylo)
...@@ -2,3 +2,9 @@ ...@@ -2,3 +2,9 @@
quarto-pub: quarto-pub:
- id: 77b30762-b473-447d-be18-d4bbd6261fbf - id: 77b30762-b473-447d-be18-d4bbd6261fbf
url: 'https://chrkoenig.quarto.pub/sdm-performance-report' url: 'https://chrkoenig.quarto.pub/sdm-performance-report'
- id: 7777d450-82af-4364-b4c1-b52d7139ade0
url: 'https://chrkoenig.quarto.pub/rf-nospec-performance-report'
- source: 05_01_performance_report_rf.qmd
quarto-pub:
- id: 98e776a5-fb0b-41f9-8733-446bf7ffb272
url: 'https://chrkoenig.quarto.pub/sdm-performance-report-rf'
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment