Commit 298d7d6e authored by ye87zine

rf experiments

parent 643e472f
# General packages
library(dplyr)
library(tidyr)
library(furrr)
# Geo packages
library(terra)
library(CoordinateCleaner)
library(sf)
library(geos)
# Stat packages
library(MASS)
library(dismo)
library(spatialEco)
library(blockCV)
library(caret)
source("R/utils.R")
load("data/r_objects/range_maps.RData")
load("data/r_objects/occs_final.RData")
# ---------------------------------------------------------------------------#
# Prepare Geodata ####
# ---------------------------------------------------------------------------#
proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs"
target_species = unique(occs_final$species)
sa_polygon = rnaturalearth::ne_countries() %>%
dplyr::filter(continent == "South America") %>%
sf::st_union()
raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>%
stringr::str_sort(numeric = T)
sa_polygon = st_transform(sa_polygon, proj_string)
occs_final = st_transform(occs_final, proj_string)
range_maps = st_transform(range_maps, proj_string)
raster_data = terra::rast(raster_filepaths) %>%
terra::project(proj_string)
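# Optional consistency check (a minimal sketch, not part of the original pipeline): after the
# reprojections above, all spatial inputs should share the CRS defined in proj_string.
# sf::st_crs(occs_final) == sf::st_crs(range_maps)   # expected: TRUE
# sf::st_crs(sa_polygon) == sf::st_crs(occs_final)   # expected: TRUE
# terra::crs(raster_data, proj = TRUE)               # should match proj_string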
# ---------------------------------------------------------------------------#
# Sample random absences ####
# #
# Absences are sampled at random #
# (1) within the defined sampling area (same number as presences) #
# (2) outside the defined sampling area (100 records) #
# ---------------------------------------------------------------------------#
species_processed = list.files("data/r_objects/pa_sampling_random_extended/", pattern = "RData") %>%
stringr::str_remove(".RData")
for(species in setdiff(target_species, species_processed)){
print(species)
# Prepare species occurrence data #####
occs_spec = dplyr::filter(occs_final, species == !!species)
if(nrow(occs_spec) <= 1){
warning("Skipping species ", species, ": Can't process species with fewer than two presence records.")
next
}
occs_multipoint = occs_spec %>%
summarise(geometry = st_combine(geometry))
range_spec = dplyr::filter(range_maps, name_matched == !!species) %>%
rename(species = name_matched)
# Define core sampling region
# 1. Union all points from range polygon and occurrences records
# 2. Build concave polygon from unioned points
# 3. Buffer by 50 km to extend sampling region into new environments
# 4. Crop by South America outline
# 5. Crop by buffered presence points
sampling_region = st_cast(range_spec, "MULTIPOINT") %>%
st_union(occs_multipoint) %>%
geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11
st_as_sf() %>%
st_set_crs(proj_string) %>%
st_buffer(dist = 50000) %>%
st_intersection(sa_polygon) %>%
st_difference(st_buffer(occs_multipoint, dist = 50000))
# Sample balanced number of pseudo absences inside core region
abs_spec_list = list()
samples_required = nrow(occs_spec)
while(samples_required != 0){
sample_points = sf::st_sample(sampling_region, size = samples_required)
env_sample = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE)
sample_abs = st_sf(env_sample, geometry = sample_points) %>%
drop_na() %>%
dplyr::mutate(
record_type = "absence_core"
)
abs_spec_list[[length(abs_spec_list)+1]] = sample_abs
samples_required = samples_required - nrow(sample_abs) # Sometimes there are no env data for sample points, so keep sampling
}
# Sample 100 pseudo absences outside core region
background_region = st_difference(sa_polygon, sampling_region)
samples_required = 100
while(samples_required != 0){
sample_points = sf::st_sample(background_region, size = samples_required)
env_sample = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE)
sample_abs = st_sf(env_sample, geometry = sample_points) %>%
drop_na() %>%
dplyr::mutate(
record_type = "absence_background"
)
abs_spec_list[[length(abs_spec_list)+1]] = sample_abs
samples_required = samples_required - nrow(sample_abs) # Sometimes there are no env data for sample points, so keep sampling
}
# Consolidate absences
abs_spec = bind_rows(abs_spec_list)
abs_spec = abs_spec %>%
bind_cols(
st_coordinates(st_transform(abs_spec, "EPSG:4326"))
) %>%
dplyr::rename(
longitude = X,
latitude = Y
) %>%
dplyr::mutate(
species = !!species,
present = 0
)
# Create presence-absence dataframe
pa_spec = occs_spec %>%
dplyr::mutate(
present = 1,
record_type = "present"
) %>%
bind_rows(abs_spec)
# ref = st_bbox(sampling_region)[c(1,3,2,4)]
# ggplot() +
# ggtitle(species) +
# geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") +
# geom_sf(data = range_spec, alpha = 0, color = "black") +
# geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) +
# scale_color_discrete(name = "Presence") +
# theme_minimal()
#
# try({
# ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling_random", device = "pdf", scale = 2)
# })
# Define cross-validation folds
folds = tryCatch({
spatial_folds = suppressMessages(
blockCV::cv_spatial(
pa_spec,
column = "present",
k = 5,
progress = F, plot = F, report = F
)
)
spatial_folds$folds_ids
}, error = function(e){
NA
})
pa_spec$fold_eval = folds
save(pa_spec, file = paste0("data/r_objects/pa_sampling_random_extended/", species, ".RData"))
}
# Combine presence-absence data
files = list.files("data/r_objects/pa_sampling_random_extended/", full.names = T)
model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>%
bind_rows() %>%
group_by(species, present) %>%
mutate(weight = 1/n()) %>%
ungroup()
save(model_data, file = "data/r_objects/model_data_random_abs_extended.RData")
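# Quick sanity check (sketch, using the model_data object saved above): each species should
# end up with as many core absences as presences, plus ~100 background absences.
# model_data %>%
#   sf::st_drop_geometry() %>%
#   dplyr::count(species, record_type) %>%
#   tidyr::pivot_wider(names_from = record_type, values_from = n)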
library(tidyverse)
library(sf)
library(terra)
source("R/utils.R")
load("data/r_objects/model_data.RData")
load("data/r_objects/sa_polygon.RData")
sa_polygon = sf::st_transform(sa_polygon, crs(model_data))
# ---------------------------------------#
# Plot all presences in the dataset ####
# ---------------------------------------#
data_extent = ext(model_data)
presences = model_data %>%
dplyr::filter(present == 1)
template_raster <- terra::rast(data_extent, resolution = 1000) # 1000m = 1km
crs(template_raster) <- st_crs(model_data)$wkt
point_counts <- terra::rasterize(
vect(presences),
template_raster,
field = NULL,
fun = "count",
background = 0
) %>%
terra::mask(vect(sa_polygon))
length(which(values(point_counts) > 0))
# only 36858 raster cells contain at least 1 presence record
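# Expressed as a share of all non-NA (land) cells (sketch):
# length(which(values(point_counts) > 0)) / length(which(!is.na(values(point_counts))))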
ggplot() +
tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) +
scale_fill_gradientn(
colors = c("black", "yellow", "darkred"),
values = scales::rescale(c(0, 1, max(values(point_counts), na.rm = TRUE))),
breaks = c(0, 1, max(values(point_counts), na.rm = TRUE)),
na.value = "transparent"
) +
geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) +
theme_minimal() +
coord_sf() +
labs(title = "Coordinate distribution and density")
ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4)
library(dplyr)
library(tidyr)
library(cito)
source("R/utils.R")
load("data/r_objects/model_data.RData")
load("data/r_objects/func_dist.RData")
model_species = intersect(model_data$species, colnames(func_dist))
model_data = model_data %>%
dplyr::filter(
!is.na(fold_eval),
species %in% !!model_species
) %>%
dplyr::mutate(species = as.factor(species)) %>%
sf::st_drop_geometry()
# ----------------------------------------------------------------------#
# Train model ####
# ----------------------------------------------------------------------#
func_dist = func_dist[model_species, model_species]
embeddings = eigen(func_dist)$vectors[,1:10]
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness")
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species, weights = embeddings)"))
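# For reference, the formula built above expands to:
# present ~ bio6+bio17+cmi+rsds+igfc+dtfw+igsw+roughness + e(species, weights = embeddings)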
# 1. Cross validation
for(fold in 1:5){
# Prepare data
train_data = dplyr::filter(model_data, fold_eval != fold)
# Run model
converged = F
while(!converged){
msdm_embed_traits_fit = dnn(
formula,
data = train_data,
hidden = c(200L, 200L, 200L),
loss = "binomial",
epochs = 5000,
lr = 0.001,
batchsize = 4096,
dropout = 0.25,
burnin = 50,
optimizer = config_optimizer("adam"),
early_stopping = 200,
validation = 0.2,
device = "cuda"
)
if(min(msdm_embed_traits_fit$losses$valid_l, na.rm = T) < 0.4){
converged = T
}
}
save(msdm_embed_traits_fit, file = paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_fold", fold,".RData"))
}
# Full model
msdm_embed_traits_fit = dnn(
formula,
data = model_data,
hidden = c(200L, 200L, 200L),
loss = "binomial",
epochs = 7500,
lr = 0.001,
baseloss = 1,
batchsize = 4096,
dropout = 0.25,
burnin = 500,
optimizer = config_optimizer("adam"),
early_stopping = 300,
validation = 0.2,
device = "cuda"
)
save(msdm_embed_traits_fit, file = paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_full.RData"))
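# Optional check of the full-model fit (a sketch; only $losses$valid_l is confirmed by the
# convergence check above, other column names may differ):
# plot(msdm_embed_traits_fit$losses$valid_l, type = "l", xlab = "Epoch", ylab = "Validation loss")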
# ----------------------------------------------------------------------#
# Evaluate model ####
# ----------------------------------------------------------------------#
msdm_embed_traits_performance = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_fold", fold, ".RData"))
test_data_split = model_data %>%
dplyr::filter(fold_eval == fold) %>%
dplyr::group_split(species)
lapply(test_data_split, function(test_data_spec){
species = test_data_spec$species[1]
performance = tryCatch({
evaluate_model(msdm_embed_traits_fit, test_data_spec)
}, error = function(e){
list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_,
Precision = NA_real_, Recall = NA_real_, F1 = NA_real_,
TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
})
performance_summary = performance %>%
as_tibble() %>%
mutate(
species = !!species,
obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
fold_eval = !!fold,
model = "MSDM_embed_traits",
) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
}) %>%
bind_rows()
}) %>%
bind_rows()
save(msdm_embed_traits_performance, file = paste0("data/r_objects/msdm_embed_traits_performance.RData"))
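# Quick summary of the cross-validation results (sketch, using the columns created above):
# msdm_embed_traits_performance %>%
#   dplyr::group_by(fold_eval, metric) %>%
#   dplyr::summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")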
 library(dplyr)
 library(tidyr)
-library(cito)
+library(caret)
+library(ranger)
 source("R/utils.R")
-load("data/r_objects/model_data.RData")
+load("data/r_objects/model_data_random_abs_extended.RData")
 model_data = model_data %>%
 dplyr::filter(!is.na(fold_eval)) %>%
@@ -48,10 +49,11 @@ for(fold in 1:5){
 metric = "Accuracy",
 trControl = train_ctrl,
 tuneGrid = tune_grid,
-num.threads = 32
+weights = data_train$weight,
+num.threads = 48
 )
-save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData"))
+save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
 }
 # Full model
@@ -83,20 +85,21 @@ rf_fit = caret::train(
 tuneGrid = tune_grid
 )
-save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
+save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
 # ----------------------------------------------------------------------#
 # Evaluate model ####
 # ----------------------------------------------------------------------#
-msdm_rf_performance = lapply(1:5, function(fold){
-load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData"))
-test_data = dplyr::filter(model_data, fold_eval == fold) %>%
+msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){
+load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
+test_data = model_data %>%
+dplyr::filter(fold_eval == fold, record_type != "absence_background") %>%
 sf::st_drop_geometry()
 actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
-probs = predict(rf_fit, test_data, type = "prob")$P
-preds = predict(rf_fit, test_data, type = "raw")
+probs = predict_new(rf_fit, test_data, type = "prob")
+preds = predict_new(rf_fit, test_data, type = "class")
 eval_dfs = data.frame(
 species = test_data$species,
@@ -139,12 +142,13 @@ msdm_rf_performance = lapply(1:5, function(fold){
 species = !!species,
 obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
 fold_eval = !!fold,
-model = "MSDM_rf",
+model = "MSDM_rf_random_abs_extended",
 ) %>%
-tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
+tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") %>%
+drop_na()
 }) %>%
 bind_rows()
 }) %>%
 bind_rows()
-save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData"))
+save(msdm_rf_random_abs_extended_performance, file = paste0("data/r_objects/msdm_rf_random_abs_extended_performance.RData"))
library(dplyr)
library(tidyr)
library(caret)
library(ranger)
source("R/utils.R")
load("data/r_objects/model_data_random_abs.RData")
model_data = model_data %>%
dplyr::filter(!is.na(fold_eval)) %>%
dplyr::mutate(
species = as.factor(species),
present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P"))
)
predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species")
full_data = model_data %>%
sf::st_drop_geometry()
train_ctrl = caret::trainControl(
search = "random",
classProbs = TRUE,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
# Run model
rf_fit = caret::train(
x = full_data[, predictors],
y = full_data$present_fct,
method = "ranger",
metric = "Accuracy",
trControl = train_ctrl,
tuneLength = 1,
num.threads = 48,
importance = 'impurity',
verbose = T
)
save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
varimp = varImp(rf_fit)
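# varimp is not used further in this script; a quick inspection could look like this (sketch):
# print(varimp)
# plot(varimp, top = length(predictors))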
# Cross validation
for(fold in 1:5){
## Preparations #####
data_train = dplyr::filter(model_data, fold_eval != fold) %>%
sf::st_drop_geometry()
# Define caret training routine
train_ctrl = caret::trainControl(
method = "cv",
number = 5,
classProbs = TRUE,
summaryFunction = caret::twoClassSummary,
savePredictions = "final"
)
tune_grid = expand.grid(
mtry = c(2,4,6,8),
splitrule = "gini",
min.node.size = c(1,4,9,16)
)
# Run model
rf_fit = caret::train(
x = data_train[, predictors],
y = data_train$present_fct,
method = "ranger",
metric = "Accuracy",
trControl = train_ctrl,
tuneGrid = tune_grid,
num.threads = 48,
verbose = F
)
save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData"))
}
# ----------------------------------------------------------------------#
# Evaluate model ####
# ----------------------------------------------------------------------#
msdm_rf_random_abs_performance = lapply(1:5, function(fold){
load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData"))
test_data = dplyr::filter(model_data, fold_eval == fold) %>%
sf::st_drop_geometry()
actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P"))
probs = predict(rf_fit, test_data, type = "prob")$P
preds = predict(rf_fit, test_data, type = "raw")
eval_dfs = data.frame(
species = test_data$species,
actual,
probs,
preds
) %>%
group_by(species) %>%
group_split()
lapply(eval_dfs, function(eval_df_spec){
species = eval_df_spec$species[1]
performance = tryCatch({
auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs, levels = c("P", "A"), direction = ">")$auc
cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P")
list(
AUC = as.numeric(auc),
Accuracy = cm$overall["Accuracy"],
Kappa = cm$overall["Kappa"],
Precision = cm$byClass["Precision"],
Recall = cm$byClass["Recall"],
F1 = cm$byClass["F1"],
TP = cm$table["P", "P"],
FP = cm$table["P", "A"],
TN = cm$table["A", "A"],
FN = cm$table["A", "P"]
)
}, error = function(e){
list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_,
Precision = NA_real_, Recall = NA_real_, F1 = NA_real_,
TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_)
})
performance_summary = performance %>%
as_tibble() %>%
mutate(
species = !!species,
obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)),
fold_eval = !!fold,
model = "MSDM_rf_random_abs",
) %>%
tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value")
}) %>%
bind_rows()
}) %>%
bind_rows()
save(msdm_rf_random_abs_performance, file = paste0("data/r_objects/msdm_rf_random_abs_performance.RData"))
@@ -12,10 +12,11 @@ library(plotly)
 library(DT)
 load("../data/r_objects/model_data.RData")
-load("../data/r_objects/ssdm_results.RData")
 load("../data/r_objects/msdm_embed_performance.RData")
-load("../data/r_objects/msdm_onehot_performance.RData")
 load("../data/r_objects/msdm_rf_performance.RData")
+load("../data/r_objects/msdm_rf_random_abs_performance.RData")
+load("../data/r_objects/msdm_rf_no_species_performance.RData")
+load("../data/r_objects/msdm_rf_no_species_random_abs_performance.RData")
 load("../data/r_objects/range_maps.RData")
 load("../data/r_objects/range_maps_gridded.RData")
@@ -54,10 +55,10 @@ lin_regression = function(x, y, family = "binomial"){
 }
 # Performance table
-performance = ssdm_results %>%
-bind_rows(msdm_embed_performance) %>%
-bind_rows(msdm_onehot_performance) %>%
-bind_rows(msdm_rf_performance) %>%
+performance = msdm_rf_performance %>%
+bind_rows(msdm_rf_random_abs_performance) %>%
+bind_rows(msdm_rf_no_species_performance) %>%
+bind_rows(msdm_rf_no_species_random_abs_performance) %>%
 dplyr::filter(fold_eval <= 3) %>% # TODO use all folds
 dplyr::group_by(species, model, metric) %>%
 dplyr::summarise(value = mean(value, na.rm = F)) %>%
@@ -174,8 +175,8 @@ Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).
 ### Key findings:
 - MSDM algorithms score much higher across all performance algorithms
 - Among MSDM algorithms, RF outperforms NNs significantly
 ## Analysis
@@ -364,7 +365,6 @@ bslib::card(plot, full_screen = T)
 ```
 :::
 ### Range coverage bias
 Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.
@@ -427,4 +427,4 @@ bslib::card(plot, full_screen = T)
 plot = plot_performance(df_plot, metric = "recall", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
 bslib::card(plot, full_screen = T)
 ```
 :::
\ No newline at end of file
@@ -13,7 +13,7 @@ library(cito)
 source("R/utils.R")
-load("data/r_objects/model_data.RData")
+load("data/r_objects/model_data_random_abs_extended.RData")
 load("data/r_objects/ssdm_results.RData")
 load("data/r_objects/msdm_embed_performance.RData")
 load("data/r_objects/msdm_onehot_performance.RData")
@@ -154,7 +154,7 @@ library(cito)
 library(ranger)
 # Define plotting function
-plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "nn", "msdm_embed", "msdm_onehot", "msdm_rf")){
+plot_predictions = function(spec, model_data, raster_data, algorithms){
 # Species data
 load("data/r_objects/range_maps.RData")
@@ -165,14 +165,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam",
 sf::st_transform(sf::st_crs(pa_spec))
 # Extract raster values into df
-bbox_spec = sf::st_bbox(pa_spec) %>%
-expand_bbox(expansion = 0.5)
+bbox_spec = sf::st_bbox(range_spec) %>%
+expand_bbox(expansion = 0.25)
 raster_crop = terra::crop(raster_data, bbox_spec)
+pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),]
 new_data = raster_crop %>%
 terra::values(dataframe = T, na.rm = T) %>%
 dplyr::mutate(species = unique(pa_spec$species))
+plots = list()
 # Make predictions
 for(algorithm in algorithms){
 message(algorithm)
@@ -187,7 +190,16 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam",
 predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P"))
 } else if(algorithm == "msdm_rf"){
 load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
-predictions = predict(rf_fit, new_data, type = "raw")
+predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
+} else if(algorithm == "msdm_rf_random_abs"){
+load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
+predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
+} else if(algorithm == "msdm_rf_fit_no_species"){
+load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_full.RData")
+predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
+} else if(algorithm == "msdm_rf_fit_no_species_random_abs"){
+load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_random_abs_full.RData")
+predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48)
 } else if(algorithm == "nn") {
 load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData"))
 new_data_tmp = dplyr::select(new_data, -species)
@@ -200,17 +212,26 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam",
 }
 raster_pred = terra::rast(raster_crop, nlyrs = 1)
-values(raster_pred)[as.integer(rownames(new_data))] <- predictions
+values(raster_pred)[as.integer(rownames(new_data))] <- as.integer(predictions) - 1
 # Plot
-plot(raster_pred,
-type = "classes",
-levels = c("Absent", "Present"),
-main = paste0("Range prediction (", toupper(algorithm), "): ", spec))
-point_colors = sapply(pa_spec$present, function(x) ifelse(x == 0, "#000000AA", "#FFFFFFAA"))
-plot(pa_spec[,"present"], col = point_colors, add = T, pch = 16, cex = 0.7)
-plot(range_spec, border = "red", lwd = 1.5, col = NA, add = T)
+p = ggplot() +
+tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) +
+scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) +
+geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FFFFFF99") +
+geom_sf(data = range_spec, col = "red", fill = NA) +
+scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") +
+theme_minimal() +
+coord_sf() +
+labs(title = paste0("Predictions vs Observations (", algorithm, "): ", spec)) +
+guides(shape = guide_legend(
+override.aes = list(color = "black") # This makes shapes black in the legend only
+))
+plots[[algorithm]] = p
 }
+return(plots)
 }
 # Load raster
@@ -221,14 +242,12 @@ raster_data = terra::rast(raster_filepaths) %>%
 terra::crop(sf::st_bbox(sa_polygon)) %>%
 terra::project(sf::st_crs(model_data)$input)
-specs = sort(sample(levels(model_data$species), 30))
+specs = sort(sample(levels(model_data$species), 4))
 for(spec in specs){
-pdf(file = paste0("plots/range_predictions/", spec, " (msdm).pdf"))
-tryCatch({
-plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_embed", "msdm_onehot", "msdm_rf"))
-}, finally = {
-dev.off()
-})
+pdf(file = paste0("plots/range_predictions/", spec, " (msdm_rf).pdf"), width = 12)
+plots = plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_rf", "msdm_rf_random_abs", "msdm_rf_fit_no_species", "msdm_rf_fit_no_species_random_abs"))
+lapply(plots, print)
+dev.off()
 }
 # ------------------------------------------------------------------ #
@@ -237,6 +256,7 @@ for(spec in specs){
 load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData")
 load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData")
 load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData")
+load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData")
 compare_species_predictions = function(model, sample_size){
 specs = sample(unique(model_data$species), 2, replace = F)
@@ -265,7 +285,7 @@ compare_species_predictions(msdm_onehot_fit, 500)
 compare_species_predictions(rf_fit, 500)
 ## --> Predictions for different species are weakly/moderately correlated in NN models (makes sense)
-## --> Predictions for different species are always highly correlated in RF model (seems problematioc)
+## --> Predictions for different species are always highly correlated in RF model (seems problematic)
 # ------------------------------------------------------------------ #
 # 5. Compare predictions across models ####
 - source: 05_performance_analysis.qmd
   quarto-pub:
     - id: 309c45c3-1515-4985-a435-9ffa1888c5e7
       url: 'https://chrkoenig.quarto.pub/ssdm-performance-analysis-0a06'
 - source: 05_01_performance_analysis_carsten.qmd
   quarto-pub:
     - id: f9cafa63-bfd4-4c00-b401-09ec137bd3ce
       url: 'https://chrkoenig.quarto.pub/sdm-performance-carsten'
 - source: 05_01_performance_analysis.qmd
   quarto-pub:
     - id: cfaa8a4d-31e3-410c-ba1a-399aad5582fd
       url: 'https://chrkoenig.quarto.pub/sdm-performance-analysis-8272'
 - source: 05_01_performance_report.qmd
   quarto-pub:
-    - id: 25ccfe7b-b9d4-4bc9-bbf8-823676aab7bd
-      url: 'https://chrkoenig.quarto.pub/sdm-performance-report-f08b'
+    - id: 77b30762-b473-447d-be18-d4bbd6261fbf
+      url: 'https://chrkoenig.quarto.pub/sdm-performance-report'
...@@ -42,18 +42,11 @@ predict_new = function(model, data, type = "prob"){ ...@@ -42,18 +42,11 @@ predict_new = function(model, data, type = "prob"){
return(probs) return(probs)
} else { } else {
preds = predict(model, data, type = "raw") preds = predict(model, data, type = "raw")
return(preds)
} }
} }
} }
if(class(model) %in% c("citodnn", "citodnnBootstrap")){
p1 = predict(model, df1, type = "response")[,1]
p2 = predict(model, df2, type = "response")[,1]
} else {
p1 = predict(model, df1, type = "prob")$P
p2 = predict(model, df2, type = "prob")$P
}
evaluate_model <- function(model, data) { evaluate_model <- function(model, data) {
# Accuracy: The proportion of correctly predicted instances (both true positives and true negatives) out of the total instances. # Accuracy: The proportion of correctly predicted instances (both true positives and true negatives) out of the total instances.
# Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN) # Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN)
......
env_vars.png (178 KiB)