---
title: "SDM Performance analysis"
format: html
editor: visual
engine: knitr
---

```{r init, echo = FALSE, include = FALSE}
library(tidyverse)
library(sf)
library(plotly)
library(DT)

load("../data/r_objects/model_data.RData")
load("../data/r_objects/ssdm_results.RData")
load("../data/r_objects/msdm_embed_performance.RData")

load("../data/r_objects/range_maps.RData")
load("../data/r_objects/range_maps_gridded.RData")
load("../data/r_objects/occs_final.RData")
load("../data/r_objects/functional_groups.RData")

sf::sf_use_s2(use_s2 = FALSE)
```

```{r globals, echo = FALSE, include = FALSE}
# Count occs per species
obs_count = model_data %>% 
  sf::st_drop_geometry() %>% 
  dplyr::filter(present == 1) %>% 
  dplyr::group_by(species) %>% 
  dplyr::summarise(obs = n())


# Regression functions
asym_regression = function(x, y){
  nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1))
  new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100))
  data.frame(
    x = new_x,
    fit = predict(nls_fit, newdata = data.frame(x = new_x))
  )
}

lin_regression = function(x, y, family = "binomial"){
  glm_fit = suppressWarnings(glm(y~x, family = family))
  new_x = seq(min(x), max(x), length.out = 100)
  data.frame(
    x = new_x,
    fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response")
  )
}

# Performance table
performance = ssdm_results %>% 
  bind_rows(msdm_embed_performance) %>% 
  dplyr::group_by(species, model, metric) %>% 
  dplyr::summarise(value = mean(value, na.rm = F)) %>% 
  dplyr::mutate(
    metric = stringr::str_to_lower(metric),
    value = case_when(
      ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5,
      ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0,
      .default = value
    )
  )


focal_metrics = unique(performance$metric)

plot_performance = function(df_plot, metric, x_var, x_label, x_log = T, reg_func = lin_regression) {
  df_plot = dplyr::filter(df_plot, metric == !!metric) %>% 
    dplyr::rename(x_var = !!x_var)
  
  # Calculate regression lines for each model and metric combination
  suppressWarnings({
    regression_lines = df_plot %>%
      group_by(model) %>%
      group_modify( ~ reg_func(.x$x_var, .x$value))
  })
  
  # Create base plot
  plot <- plot_ly() %>%
    layout(
      title = paste0("Model Performance vs. ", x_label),
      xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")),
      yaxis = list(title = metric),
      legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot
      margin = list(r = 150), # Add right margin to accommodate legend
      hovermode = 'closest'
    )
  
  # Points
  for (model_name in unique(df_plot$model)) {
    plot = plot %>%
      add_markers(
        data = filter(df_plot, model == model_name, metric %in% focal_metrics),
        x = ~ x_var,
        y = ~ value,
        color = model_name, # Set color to match legendgroup
        legendgroup = model_name,
        opacity = 0.6,
        name = ~ model,
        hoverinfo = 'text',
        text = ~ paste(
          "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(value, 3)
        )
      )
  }
  
  # Add regression lines
  for (model_name in unique(df_plot$model)) {
    reg_data = dplyr::filter(regression_lines, model == model_name)
    plot = plot %>%
      add_lines(
        data = reg_data,
        x = ~ x,
        y = ~ fit,
        color = model_name, # Set color to match legendgroup
        legendgroup = model_name,
        name = paste(model_name, '(fit)'),
        showlegend = FALSE
      )
  }
  
  return(plot)
}
```

## Summary

This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias, and functional group). The comparison of sSDM vs mSDM approaches is of particular interest.

Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling).

### Modeling overview:

#### General decisions

-   Balanced presences and absences for each species
-   Absence sampling from probability surface that balances geographical sampling bias and environmental preferences per species
    -   higher probability in areas that have been sampled more intensely
    -   lower probability in areas with environmental conditions similar to presence locations

![absence sampling sampling](images/Akodon%20boliviensis.pdf){width="100%" height="800"}

-   Eight predictors:
    -   Min Temperature of Coldest Month (bio6)
    -   Precipitation of Driest Quarter (bio17)
    -   Climate Moisture Index (cmi)
    -   Surface Downwelling Shortwave Flux (rsds)
    -   Tree canopy cover (igfc)
    -   Distance to freshwater (dtfw)
    -   Seasonal inundation(igsw)
    -   Topographic roughness (roughness)
-   Nested cross validation
    -   Outer (performance evaluation): Spatially blocked 5-fold cross validation for all models
    -   Inner (model training): Random split (1/5) in cito, Spatially blocked 5-fold cross validation in other models

#### sSDM Algorithms

-   Four Algorithms: Random Forest (RF), Gradient Boosting Machine (GBM), Generalized Additive Model (GAM), Neural Network (NN)

-   NN: Manual hyperparameter tuning, same settings across species

-   RF + GBM + GAM: Automated hyperparameter tuning (4 random combinations) per species

#### mSDM Algorithms

Multispecies Neural Network with species embedding (**MSDM_embed**)

-   prediction: probability of occurrence given a set of (environmental) inputs and species identity
-   embedding initialized at random
-   three hidden layers, sigmoid + leaky ReLu activations, binomial loss

### Key findings:

TBA

## Analysis

The table below shows the analysed modeling results.

```{r performance, echo = FALSE, message=FALSE, warnings=FALSE}
DT::datatable(performance) %>% 
  formatRound(columns="value", digits=3)
```

### Number of records

-   Model performance was generally better for species with more observations
-   Very poor performance below 50-100 observations

```{r, echo = FALSE, message=FALSE, warnings=FALSE}
df_plot = performance %>% 
  dplyr::left_join(obs_count, by = "species") 
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Cohen's kappa

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "kappa", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "obs", x_label = "Number of records", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```
:::

### Range size

Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016).

```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE}
df_join = range_maps %>% 
  dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>%  # range in sqkm
  sf::st_drop_geometry()

df_plot = performance %>% 
  inner_join(df_join, by = c("species" = "name_matched"))
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "range_size", x_label = "Range size")
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "range_size", x_label = "Range size")
bslib::card(plot, full_screen = T)
```

#### Cohen's kappa

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "kappa", x_var = "range_size", x_label = "Range size", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_size", x_label = "Range size")
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "range_size", x_label = "Range size")
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "range_size", x_label = "Range size")
bslib::card(plot, full_screen = T)
```
:::

### Range coverage

Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells.

$$
RangeCoverage = \frac{N_{cells\_occ}}{N_{cells\_total}}
$$

```{r range_coverage, echo = FALSE, message=FALSE, warnings=FALSE}
occs_final_unproj = occs_final %>% sf::st_transform("EPSG:4326")

df_cells_total = range_maps_gridded %>%
  dplyr::rename("species" = name_matched) %>% 
  group_by(species) %>%
  summarise(cells_total = n()) %>% 
  st_drop_geometry()

df_cells_occ <- range_maps_gridded %>%
  st_join(occs_final_unproj, join = st_intersects) %>%
  filter(name_matched == species) %>%         # Filter only intersections of the same species
  group_by(species) %>%
  summarise(cells_occupied = n_distinct(geometry)) %>%
  st_drop_geometry()

df_join = df_cells_total %>% 
  dplyr::inner_join(df_cells_occ, by = "species") %>% 
  dplyr::mutate(range_cov = cells_occupied / cells_total) %>% 
  dplyr::select(species, range_cov)

df_plot = performance %>% 
  inner_join(df_join, by = "species")
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Cohen's kappa

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "kappa", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```
:::