--- title: "SDM Performance report" format: html editor: visual engine: knitr --- ```{r init, echo = FALSE, include = FALSE} library(tidyverse) library(sf) library(plotly) library(DT) load("../data/r_objects/model_data.RData") load("../data/r_objects/ssdm_results.RData") load("../data/r_objects/msdm_embed_performance.RData") load("../data/r_objects/msdm_onehot_performance.RData") load("../data/r_objects/msdm_rf_performance.RData") load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData") load("../data/r_objects/occs_final.RData") load("../data/r_objects/functional_groups.RData") sf::sf_use_s2(use_s2 = FALSE) ``` ```{r globals, echo = FALSE, include = FALSE} # Count occs per species obs_count = model_data %>% sf::st_drop_geometry() %>% dplyr::filter(present == 1) %>% dplyr::group_by(species) %>% dplyr::summarise(obs = n()) # Regression functions asym_regression = function(x, y){ nls_fit = nls(y ~ 1 - (1-b) * exp(-c * log(x)), start = list(b = 0.1, c = 0.1)) new_x = exp(seq(log(min(x)), log(max(x)), length.out = 100)) data.frame( x = new_x, fit = predict(nls_fit, newdata = data.frame(x = new_x)) ) } lin_regression = function(x, y, family = "binomial"){ glm_fit = suppressWarnings(glm(y~x, family = family)) new_x = seq(min(x), max(x), length.out = 100) data.frame( x = new_x, fit = predict(glm_fit, newdata = data.frame(x = new_x), type = "response") ) } # Performance table performance = ssdm_results %>% bind_rows(msdm_embed_performance) %>% bind_rows(msdm_onehot_performance) %>% bind_rows(msdm_rf_performance) %>% dplyr::filter(fold_eval <= 3) %>% # TODO use all folds dplyr::group_by(species, model, metric) %>% dplyr::summarise(value = mean(value, na.rm = F)) %>% dplyr::mutate( metric = stringr::str_to_lower(metric), value = case_when( ((is.na(value) | is.nan(value)) & metric %in% c("auc", "f1", "accuracy", "precision", "recall")) ~ 0.5, ((is.na(value) | is.nan(value)) & metric %in% c("kappa")) ~ 0, .default = value ) ) focal_metrics = c("accuracy", "auc", "f1", "kappa", "precision", "recall") plot_performance = function(df_plot, metric, x_var, x_label, x_log = T, reg_func = lin_regression) { df_plot = dplyr::filter(df_plot, metric == !!metric) %>% dplyr::rename(x_var = !!x_var) # Calculate regression lines for each model and metric combination suppressWarnings({ regression_lines = df_plot %>% group_by(model) %>% group_modify( ~ reg_func(.x$x_var, .x$value)) }) # Create base plot plot <- plot_ly() %>% layout( title = paste0("Model Performance vs. 
", x_label), xaxis = list(title = x_label, type = ifelse(x_log, "log", "linear")), yaxis = list(title = metric), legend = list(x = 1.1, y = 0.5), # Move legend to the right of the plot margin = list(r = 150), # Add right margin to accommodate legend hovermode = 'closest' ) # Points for (model_name in unique(df_plot$model)) { plot = plot %>% add_markers( data = filter(df_plot, model == model_name, metric %in% focal_metrics), x = ~ x_var, y = ~ value, color = model_name, # Set color to match legendgroup legendgroup = model_name, opacity = 0.6, name = ~ model, hoverinfo = 'text', text = ~ paste( "Species:", species, "<br>", x_label, ":", x_var, "<br>Value:", round(value, 3) ) ) } # Add regression lines for (model_name in unique(df_plot$model)) { reg_data = dplyr::filter(regression_lines, model == model_name) plot = plot %>% add_lines( data = reg_data, x = ~ x, y = ~ fit, color = model_name, # Set color to match legendgroup legendgroup = model_name, name = paste(model_name, '(fit)'), showlegend = FALSE ) } return(plot) } ``` ## Summary This document summarizes the performance of different sSDM and mSDM algorithms for `r I(length(unique(performance$species)))` South American mammal species. Model performance is evaluated on `r I(xfun::numbers_to_words(length(focal_metrics)))` metrics (`r I(paste(focal_metrics, collapse = ', '))`) and analyzed along five potential influence factors (number of records, range size, range coverage, range coverage bias). The comparison of sSDM vs mSDM approaches is of particular interest. Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). ### Modeling overview: #### General decisions - Balanced presences and absences for each species - Absence sampling from probability surface that balances geographical sampling bias and environmental preferences per species - higher probability in areas that have been sampled more intensely - lower probability in areas with environmental conditions similar to presence locations - Eight predictors: - Min Temperature of Coldest Month (bio6) - Precipitation of Driest Quarter (bio17) - Climate Moisture Index (cmi) - Surface Downwelling Shortwave Flux (rsds) - Tree canopy cover (igfc) - Distance to freshwater (dtfw) - Seasonal inundation(igsw) - Topographic roughness (roughness) - Nested cross validation - Outer (performance evaluation): Spatially blocked 5-fold cross validation for all models - Inner (model training): Random split (1/5) in cito, Spatially blocked 5-fold cross validation in other models #### sSDM Algorithms - Four algorithms: Random Forest (RF), Gradient Boosting Machine (GBM), Generalized Additive Model (GAM), Neural Network (NN) - NN: Manual hyperparameter tuning, same settings across species - RF + GBM + GAM: Automated hyperparameter tuning (8 random combinations) per species #### mSDM Algorithms - Three algorithms: Random Forest (MSDM_rf), Neural Network (MSDM_embed, MSDM_onehot) - Species identity part of the input data, internal representation then either as onehot vector (MSDM_rf, MSDM_onehot) or via embedding (MSDM_embed) ### Key findings: - MSDM algorithms score much higher across all performance algorithms - Among MSDM algorithms, RF outperforms NNs significantly ## Analysis ### Number of records ```{r, echo = FALSE, message=FALSE, warnings=FALSE} df_plot = performance %>% dplyr::left_join(obs_count, by = "species") ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "obs", x_label = "Number of records", reg_func = 
asym_regression) bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "obs", x_label = "Number of records", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` ::: ### Range size Range size was calculated based on polygon layers from the IUCN Red List of Threatened Species (2016). ```{r range_size, echo = FALSE, message=FALSE, warnings=FALSE} df_join = range_maps %>% dplyr::mutate(range_size = as.numeric(sf::st_area(range_maps) / 1000000)) %>% # range in sqkm sf::st_drop_geometry() df_plot = performance %>% inner_join(df_join, by = c("species" = "name_matched")) ``` ::: panel-tabset #### AUC ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "auc", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### F1 ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "f1", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Cohen's kappa ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "kappa", x_var = "range_size", x_label = "Range size", reg_func = asym_regression) bslib::card(plot, full_screen = T) ``` #### Accuracy ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Precision ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "precision", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` #### Recall ```{r echo = FALSE} plot = plot_performance(df_plot, metric = "recall", x_var = "range_size", x_label = "Range size") bslib::card(plot, full_screen = T) ``` ::: ### Range coverage Species ranges were split into continuous hexagonal grid cells of 1 degree diameter. Range coverage was then calculated as the number of grid cells containing at least one occurrence record divided by the number of total grid cells. 
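The gridded ranges are loaded above as the precomputed object `range_maps_gridded`. For orientation, the following is a minimal sketch of how such a hexagonal grid could be derived from the range polygons with `sf`; the cell size argument and the join logic are assumptions for illustration, not the code that produced the shipped object, and the chunk is not evaluated.

```{r hex_grid_sketch, eval = FALSE}
# Illustrative sketch only (not the code that built range_maps_gridded):
# lay a hexagonal grid of roughly 1 degree cells over the range polygons and
# keep, per species, the cells that intersect its range.
hex_grid = sf::st_make_grid(range_maps, cellsize = 1, square = FALSE) %>%
  sf::st_as_sf()

range_maps_gridded_sketch = hex_grid %>%
  sf::st_join(range_maps, join = sf::st_intersects, left = FALSE) # keeps name_matched for each intersecting cell
```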
```{r range_coverage, echo = FALSE, message = FALSE, warning = FALSE}
occs_final_unproj = occs_final %>%
  sf::st_transform("EPSG:4326")

df_cells_total = range_maps_gridded %>%
  dplyr::rename("species" = name_matched) %>%
  group_by(species) %>%
  summarise(cells_total = n()) %>%
  st_drop_geometry()

df_cells_occ <- range_maps_gridded %>%
  st_join(occs_final_unproj, join = st_intersects) %>%
  filter(name_matched == species) %>% # Keep only intersections of the same species
  group_by(species) %>%
  summarise(cells_occupied = n_distinct(geometry)) %>%
  st_drop_geometry()

df_join = df_cells_total %>%
  dplyr::inner_join(df_cells_occ, by = "species") %>%
  dplyr::mutate(range_cov = cells_occupied / cells_total) %>%
  dplyr::select(species, range_cov)

df_plot = performance %>%
  inner_join(df_join, by = "species")
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Cohen's kappa

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "kappa", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "range_cov", x_label = "Range coverage", reg_func = asym_regression)
bslib::card(plot, full_screen = T)
```
:::

### Range coverage bias

Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range.

$$
RangeCoverageBias = 1 - \frac{RangeCoverage}{\min(N_{obs\_total} / N_{cells\_total}, 1)}
$$

Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
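As a quick numerical illustration of the formula (made-up toy numbers, not taken from the data, and not evaluated in the report):

```{r bias_example, eval = FALSE}
# Toy example: 100 grid cells in the range, 20 of them occupied, 50 records in total
cells_total    = 100
cells_occupied = 20
occs_total     = 50

range_coverage = cells_occupied / cells_total   # 0.2
max_coverage   = min(occs_total / cells_total, 1) # 0.5 (50 records can cover at most 50 of 100 cells)
range_bias     = 1 - range_coverage / max_coverage # 0.6 -> records are fairly clustered
```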
```{r range_coverage_bias, echo = FALSE, message = FALSE, warning = FALSE}
occs_final_unproj = occs_final %>%
  sf::st_transform("EPSG:4326")

df_occs_total = occs_final_unproj %>%
  st_drop_geometry() %>%
  group_by(species) %>%
  summarise(occs_total = n())

df_join = df_occs_total %>%
  dplyr::inner_join(df_cells_total, by = "species") %>%
  dplyr::inner_join(df_cells_occ, by = "species") %>%
  dplyr::mutate(range_bias = 1 - ((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1)))

df_plot = performance %>%
  inner_join(df_join, by = "species")
```

::: panel-tabset
#### AUC

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "auc", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### F1

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "f1", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### Accuracy

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "accuracy", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### Precision

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "precision", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```

#### Recall

```{r echo = FALSE}
plot = plot_performance(df_plot, metric = "recall", x_var = "range_bias", x_label = "Range coverage bias", x_log = F)
bslib::card(plot, full_screen = T)
```
:::