Skip to content
Snippets Groups Projects
Commit 87dd6af4 authored by ye87zine's avatar ye87zine
Browse files

implemented phylo and range models

parent 6213fe55
Branches
No related tags found
No related merge requests found
......@@ -108,4 +108,6 @@ range_dist = mean_minimum_distances %>%
column_to_rownames("species1") %>%
as.matrix()
range_dist = range_dist / max(range_dist) # Scale to (0,1)
save(range_dist, file = "data/r_objects/range_dist.RData")
......@@ -5,16 +5,16 @@ library(cito)
source("R/utils.R")
load("data/r_objects/model_data.RData")
load("data/r_objects/phylo_dist.RData")
load("data/r_objects/func_dist.RData")
# ----------------------------------------------------------------------#
# Prepare data ####
# ----------------------------------------------------------------------#
model_species = intersect(model_data$species, colnames(phylo_dist))
model_species = intersect(model_data$species, names(func_dist))
model_data_final = model_data %>%
dplyr::filter(species %in% !!model_species) %>%
# dplyr::mutate_at(vars(starts_with("layer")), scale) %>% # Scaling seems to make things worse often
# dplyr::mutate_at(vars(starts_with("layer")), ~as.vector(scale(.))) %>% # Scaling seems to make things worse often
dplyr::mutate(species_int = as.integer(as.factor(species)))
train_data = dplyr::filter(model_data_final, train == 1)
......@@ -31,6 +31,8 @@ predictors = paste0("layer_", 1:19)
# ----------------------------------------------------------------------#
# 1. Train
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
msdm_fit_embedding_traits_static = dnn(
formula,
data = train_data,
......@@ -87,6 +89,8 @@ save(msdm_results_embedding_traits_static, file = "data/r_objects/msdm_results_e
# With training the embedding ####
# ------------------------------------------------------------ ------#
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
msdm_fit_embedding_traits_trained = dnn(
formula,
data = train_data,
......
......@@ -31,6 +31,8 @@ predictors = paste0("layer_", 1:19)
# ----------------------------------------------------------------------#
# 1. Train
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
msdm_fit_embedding_phylo_static = dnn(
formula,
data = train_data,
......@@ -87,6 +89,8 @@ save(msdm_results_embedding_phylo_static, file = "data/r_objects/msdm_results_em
# With training the embedding ####
# ------------------------------------------------------------ ------#
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
msdm_fit_embedding_phylo_trained = dnn(
formula,
data = train_data,
......
library(dplyr)
library(tidyr)
library(cito)
source("R/utils.R")
load("data/r_objects/model_data.RData")
load("data/r_objects/range_dist.RData")
# ----------------------------------------------------------------------#
# Prepare data ####
# ----------------------------------------------------------------------#
model_species = intersect(model_data$species, colnames(range_dist))
model_data_final = model_data %>%
dplyr::filter(species %in% !!model_species) %>%
# dplyr::mutate_at(vars(starts_with("layer")), scale) %>% # Scaling seems to make things worse often
dplyr::mutate(species_int = as.integer(as.factor(species)))
train_data = dplyr::filter(model_data_final, train == 1)
test_data = dplyr::filter(model_data_final, train == 0)
sp_ind = match(model_species, colnames(range_dist))
range_dist = range_dist[sp_ind, sp_ind]
embeddings = eigen(range_dist)$vectors[,1:20]
mode(embeddings) = "numeric"
predictors = paste0("layer_", 1:19)
# ----------------------------------------------------------------------#
# Without training the embedding ####
# ----------------------------------------------------------------------#
# 1. Train
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = F)"))
plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
msdm_fit_embedding_range_static = dnn(
formula,
data = train_data,
hidden = c(500L, 500L, 500L),
loss = "binomial",
activation = c("sigmoid", "leaky_relu", "leaky_relu"),
epochs = 15000L,
lr = 0.01,
baseloss = 1,
batchsize = nrow(train_data),
dropout = 0.1,
burnin = 100,
optimizer = config_optimizer("adam", weight_decay = 0.001),
lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
early_stopping = 250,
validation = 0.3,
device = "cuda",
)
save(msdm_fit_embedding_range_static, file = "data/r_objects/msdm_fit_embedding_range_static.RData")
# 2. Evaluate
# Per species
load("data/r_objects/msdm_fit_embedding_range_static.RData")
data_split = test_data %>%
group_by(species_int) %>%
group_split()
msdm_results_embedding_range_static = lapply(data_split, function(data_spec){
target_species = data_spec$species[1]
data_spec = dplyr::select(data_spec, -species)
msdm_performance = tryCatch({
evaluate_model(msdm_fit_embedding_range_static, data_spec)
}, error = function(e){
list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
})
performance_summary = tibble(
species = !!target_species,
obs = length(which(model_data$species == target_species)),
model = "MSDM_embed_informed_range_static",
auc = msdm_performance$AUC,
accuracy = msdm_performance$Accuracy,
kappa = msdm_performance$Kappa,
precision = msdm_performance$Precision,
recall = msdm_performance$Recall,
f1 = msdm_performance$F1
)
}) %>% bind_rows()
save(msdm_results_embedding_range_static, file = "data/r_objects/msdm_results_embedding_range_static.RData")
# -------------------------------------------------------------------#
# With training the embedding ####
# ------------------------------------------------------------ ------#
formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species_int, weights = embeddings, lambda = 0.00001, train = T)"))
plot(1, type="n", xlab="", ylab="", xlim=c(0, 15000), ylim=c(0, 0.7)) # empty plot with better limits, draw points in there
msdm_fit_embedding_range_trained = dnn(
formula,
data = train_data,
hidden = c(500L, 500L, 500L),
loss = "binomial",
activation = c("sigmoid", "leaky_relu", "leaky_relu"),
epochs = 15000L,
lr = 0.01,
baseloss = 1,
batchsize = nrow(train_data),
dropout = 0.1,
burnin = 100,
optimizer = config_optimizer("adam", weight_decay = 0.001),
lr_scheduler = config_lr_scheduler("reduce_on_plateau", patience = 100, factor = 0.7),
early_stopping = 250,
validation = 0.3,
device = "cuda",
)
save(msdm_fit_embedding_range_trained, file = "data/r_objects/msdm_fit_embedding_range_trained.RData")
# 2. Evaluate
load("data/r_objects/msdm_fit_embedding_range_trained.RData")
data_split = test_data %>%
group_by(species_int) %>%
group_split()
msdm_results_embedding_range_trained = lapply(data_split, function(data_spec){
target_species = data_spec$species[1]
data_spec = dplyr::select(data_spec, -species)
msdm_performance = tryCatch({
evaluate_model(msdm_fit_embedding_range_trained, data_spec)
}, error = function(e){
list(AUC = NA, Accuracy = NA, Kappa = NA, Precision = NA, Recall = NA, F1 = NA)
})
performance_summary = tibble(
species = !!target_species,
obs = length(which(model_data$species == target_species)),
model = "MSDM_embed_informed_range_trained",
auc = msdm_performance$AUC,
accuracy = msdm_performance$Accuracy,
kappa = msdm_performance$Kappa,
precision = msdm_performance$Precision,
recall = msdm_performance$Recall,
f1 = msdm_performance$F1
)
}) %>% bind_rows()
save(msdm_results_embedding_range_trained, file = "data/r_objects/msdm_results_embedding_range_trained.RData")
---
title: "sSDM Performance analysis"
title: "SDM Performance analysis"
format: html
editor: visual
engine: knitr
......@@ -66,7 +66,7 @@ performance = bind_rows(ssdm_results, msdm_results, msdm_results_embedding_train
## Summary
This document summarizes the performance of different sSDM and mMSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.
This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.
Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).
......@@ -529,7 +529,7 @@ bslib::card(plot, full_screen = T)
Functional groups were assigned based on taxonomic order. The following groupings were used:
| Functional group | Taxomic orders |
|--------------------------|----------------------------------------------|
|-------------------|-----------------------------------------------------|
| large ground-dwelling | Carnivora, Artiodactyla, Cingulata, Perissodactyla |
| small ground-dwelling | Rodentia, Didelphimorphia, Soricomorpha, Paucituberculata, Lagomorpha |
| arboreal | Primates, Pilosa |
......
---
title: "mSDM comparison"
format: html
editor: visual
engine: knitr
---
```{r init, echo = FALSE, include = FALSE}
library(tidyverse)
library(sf)
library(plotly)
library(DT)
load("../data/r_objects/msdm_results_embedding_raw.RData")
load("../data/r_objects/msdm_results_embedding_traits_static.RData")
load("../data/r_objects/msdm_results_embedding_traits_trained.RData")
load("../data/r_objects/msdm_results_embedding_phylo_static.RData")
load("../data/r_objects/msdm_results_embedding_phylo_trained.RData")
load("../data/r_objects/msdm_results_embedding_range_static.RData")
load("../data/r_objects/msdm_results_embedding_range_trained.RData")
sf::sf_use_s2(use_s2 = FALSE)
```
```{r globals, echo = FALSE, include = FALSE}
results_embedding_raw = msdm_results_embedding_raw %>%
dplyr::mutate(
embedding = "raw",
training = "trained"
)
results_embedding_informed = c(
"msdm_results_embedding_phylo_static",
"msdm_results_embedding_phylo_trained",
"msdm_results_embedding_range_static",
"msdm_results_embedding_range_trained",
"msdm_results_embedding_traits_static",
"msdm_results_embedding_traits_trained"
)
results_embedding_informed_merged = lapply(results_embedding_informed, function(df_name){
df = get(df_name)
name_split = str_split_1(df_name, pattern = "_")
df$embedding = name_split[4]
df$training = name_split[5]
assign(df_name, df)
}) %>% bind_rows()
results_final = results_embedding_raw %>%
bind_rows(results_embedding_informed_merged) %>%
drop_na()
```
```{r globals, echo = FALSE}
auc_overview = results_final %>%
pivot_wider(names_from = model, values_from = auc, id_cols = c(species, obs)) %>%
dplyr::arrange(obs)
accuracy_overview = results_final %>%
pivot_wider(names_from = model, values_from = accuracy, id_cols = c(species, obs)) %>%
dplyr::arrange(obs)
f1_overview = results_final %>%
pivot_wider(names_from = model, values_from = f1, id_cols = c(species, obs)) %>%
dplyr::arrange(obs)
```
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment