From 0297e6abc1bc3f2486b283886bf16c55daa90c73 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <ye87zine@usr.idiv.de>
Date: Mon, 28 Oct 2024 12:39:52 +0100
Subject: [PATCH] changed bias metric yet again

---
 R/performance_analysis.qmd | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/R/performance_analysis.qmd b/R/performance_analysis.qmd
index 252c7bb..a0d098e 100644
--- a/R/performance_analysis.qmd
+++ b/R/performance_analysis.qmd
@@ -37,7 +37,7 @@ Key findings:
 -   RF performed best, GBM slightly worse, GLM worst
 -   More occurrence records and larger range sizes tended to improve model performance
 -   Higher range coverage correlated with better performance.
--   Range coverage bias and functional group showed some impact but were less consistent <!-- TODO: check after rerun -->
+-   Range coverage bias and functional group showed some impact but were less consistent
 
 ## Analysis
 
@@ -263,10 +263,10 @@ bslib::card(plot, full_screen = T)
 
 #### Range coverage bias
 
-Range coverage bias was calculated as the as the minimum of total grid cells and total occurrences divided by the number of occupied cells.
+Range coverage bias was calculated as 1 minus the ratio of the actual range coverage to the hypothetical range coverage that would be achieved if all observations were maximally spread out across the range.
 
 $$
-RangeCoverageBias = \frac{min(N_{cells\_total}, N_{obs\_total})}{N_{cells\_occupied}}
+RangeCoverageBias = 1 - \frac{RangeCoverage}{\min\left(N_{obs\_total} / N_{cells\_total},\ 1\right)}
 $$
 
 Higher bias values indicate that occurrence records are spatially more clustered within the range of the species.
@@ -282,7 +282,7 @@ df_occs_total = occs_final %>%
 df_join = df_occs_total %>% 
   dplyr::inner_join(df_cells_total, by = "species") %>% 
   dplyr::inner_join(df_cells_occ, by = "species") %>% 
-  dplyr::mutate(range_bias = pmin(cells_total, occs_total) / cells_occupied)
+  dplyr::mutate(range_bias = 1-((cells_occupied / cells_total) / pmin(occs_total / cells_total, 1)))
 
 df_plot = performance %>% 
   inner_join(df_join, by = "species")
@@ -312,7 +312,7 @@ plot <- plot_ly(
 plot <- plot %>%
   layout(
     title = "Model Performance vs. Range coverage bias",
-    xaxis = list(title = "Range coverage bias", type = "log"),
+    xaxis = list(title = "Range coverage bias"),
     yaxis = list(title = "Value"),
     legend = list(x = 1.1, y = 0.5),  # Move legend to the right of the plot
     margin = list(r = 150),  # Add right margin to accommodate legend
-- 
GitLab