diff --git a/R/03_03_absence_preparation_random.R b/R/03_03_absence_preparation_random.R new file mode 100644 index 0000000000000000000000000000000000000000..7986d0bf8e98e534c7160479a7c1f4e8235ea9f4 --- /dev/null +++ b/R/03_03_absence_preparation_random.R @@ -0,0 +1,185 @@ +# General packages +library(dplyr) +library(tidyr) +library(furrr) + +# Geo packages +library(terra) +library(CoordinateCleaner) +library(sf) +library(geos) + +# Stat packages +library(MASS) +library(dismo) +library(spatialEco) +library(blockCV) +library(caret) + +source("R/utils.R") + +load("data/r_objects/range_maps.RData") +load("data/r_objects/occs_final.RData") + +# ---------------------------------------------------------------------------# +# Prepare Geodata #### +# ---------------------------------------------------------------------------# +proj_string = "+proj=tcea +lon_0=-58.0704167 +datum=WGS84 +units=m +no_defs" + +target_species = unique(occs_final$species) + +sa_polygon = rnaturalearth::ne_countries() %>% + dplyr::filter(continent == "South America") %>% + sf::st_union() + +raster_filepaths = list.files("~/symobio-modeling/data/geospatial/raster/", full.names = T) %>% + stringr::str_sort(numeric = T) + +sa_polygon = st_transform(sa_polygon, proj_string) +occs_final = st_transform(occs_final, proj_string) +range_maps = st_transform(range_maps, proj_string) +raster_data = terra::rast(raster_filepaths) %>% + terra::project(proj_string) + +# ---------------------------------------------------------------------------# +# Sample random absences #### +# # +# Absences are sampled at random # +# (1) within the defined sampling area (same number as presences) # +# (2) outside the defined sampling area (100 records) # +# ---------------------------------------------------------------------------# +species_processed = list.files("data/r_objects/pa_sampling_random_extended/", pattern = "RData") %>% + stringr::str_remove(".RData") + +for(species in setdiff(target_species, species_processed)){ + print(species) + # Prepare species occurrence data ##### + occs_spec = dplyr::filter(occs_final, species == !!species) + if(nrow(occs_spec) <= 1){ + warning("Skipping species ", species, ": Can't process species with a single presence record." ) + } + + occs_multipoint = occs_spec %>% + summarise(geometry = st_combine(geometry)) + + range_spec = dplyr::filter(range_maps, name_matched == !!species) %>% + rename(species = name_matched) + + # Define core sampling region + # 1. Union all points from range polygon and occurrences records + # 2. Build concave polygon from unioned points + # 3. Buffer by 50 km to extend sampling region into new environments + # 4. Crop by South America outline + # 5. Crop by buffered presence points + sampling_region = st_cast(range_spec, "MULTIPOINT") %>% + st_union(occs_multipoint) %>% + geos::geos_concave_hull(ratio = 0.3) %>% # Concave hull in sf requires geos 3.11 + st_as_sf() %>% + st_set_crs(proj_string) %>% + st_buffer(dist = 50000) %>% + st_intersection(sa_polygon) %>% + st_difference(st_buffer(occs_multipoint, dist = 50000)) + + # Sample balanced number of pseudo absences inside core region + abs_spec_list = list() + + samples_required = nrow(occs_spec) + while(samples_required != 0){ + sample_points = sf::st_sample(sampling_region, size = samples_required) + env_sample = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) + + sample_abs = st_sf(env_sample, geometry = sample_points) %>% + drop_na() %>% + dplyr::mutate( + record_type = "absence_core" + ) + + abs_spec_list[[length(abs_spec_list)+1]] = sample_abs + + samples_required = samples_required - nrow(sample_abs) # Sometimes there are no env data for sample points, so keep sampling + } + + # Sample 100 pseudo absences outside core region + background_region = st_difference(sa_polygon, sampling_region) + samples_required = 100 + while(samples_required != 0){ + sample_points = sf::st_sample(background_region, size = samples_required) + env_sample = terra::extract(raster_data, terra::vect(sample_points), ID = FALSE) + + sample_abs = st_sf(env_sample, geometry = sample_points) %>% + drop_na() %>% + dplyr::mutate( + record_type = "absence_background" + ) + + abs_spec_list[[length(abs_spec_list)+1]] = sample_abs + + samples_required = samples_required - nrow(sample_abs) # Sometimes there are no env data for sample points, so keep sampling + } + + # Consolidate absences + abs_spec = bind_rows(abs_spec_list) + abs_spec = abs_spec %>% + bind_cols( + st_coordinates(st_transform(abs_spec, "EPSG:4326")) + ) %>% + dplyr::rename( + longitude = X, + latitude = Y + ) %>% + dplyr::mutate( + species = !!species, + present = 0 + ) + + # Create presence-absence dataframe + pa_spec = occs_spec %>% + dplyr::mutate( + present = 1, + record_type = "present" + ) %>% + bind_rows(abs_spec) + + # ref = st_bbox(sampling_region)[c(1,3,2,4)] + # ggplot() + + # ggtitle(species) + + # geom_sf(data = st_crop(sa_polygon, ref), fill = "#756445") + + # geom_sf(data = range_spec, alpha = 0, color = "black") + + # geom_sf(data = pa_spec, aes(color = as.factor(present)), size = 0.7) + + # scale_color_discrete(name = "Presence") + + # theme_minimal() + # + # try({ + # ggsave(paste0(species, ".pdf"), path = "plots/pa_sampling_random", device = "pdf", scale = 2) + # }) + + # Define cross-validation folds + folds = tryCatch({ + spatial_folds = suppressMessages( + blockCV::cv_spatial( + pa_spec, + column = "present", + k = 5, + progress = F, plot = F, report = F + ) + ) + + spatial_folds$folds_ids + }, error = function(e){ + NA + }) + + pa_spec$fold_eval = folds + + save(pa_spec, file = paste0("data/r_objects/pa_sampling_random_extended/", species, ".RData")) +} + +# Combine presence-absence data +files = list.files("data/r_objects/pa_sampling_random_extended/", full.names = T) +model_data = lapply(files, function(f){load(f); return(pa_spec)}) %>% + bind_rows() %>% + group_by(species, present) %>% + mutate(weight = 1/n()) %>% + ungroup() + +save(model_data, file = "data/r_objects/model_data_random_abs_extended.RData") diff --git a/R/03_03_dataset_exploration.R b/R/03_03_dataset_exploration.R new file mode 100644 index 0000000000000000000000000000000000000000..ad331734226f8f071007abe5e739d5fd83df1d4b --- /dev/null +++ b/R/03_03_dataset_exploration.R @@ -0,0 +1,53 @@ +library(tidyverse) +library(sf) +library(terra) + +source("R/utils.R") + +load("data/r_objects/model_data.RData") +load("data/r_objects/sa_polygon.RData") + +sa_polygon = sf::st_transform(sa_polygon, crs(model_data)) + +# ---------------------------------------# +# Plot all presences in the dataset #### +# ---------------------------------------# +data_extent = ext(model_data) + +presences = model_data %>% + dplyr::filter(present == 1) + +template_raster <- terra::rast(data_extent, resolution = 1000) # 1000m = 1km +crs(template_raster) <- st_crs(model_data)$wkt + +point_counts <- terra::rasterize( + vect(presences), + template_raster, + field = NULL, + fun = "count", + background = 0 +) %>% + terra::mask(vect(sa_polygon)) + +length(which(values(point_counts) > 0)) + +# only 36858 raster cells contain at least 1 presence record + +ggplot() + + tidyterra::geom_spatraster(data = point_counts, maxcell = 5e7) + + scale_fill_gradientn( + colors = c("black", "yellow", "darkred"), + values = scales::rescale(c(0, 1, max(point_counts_df$count, na.rm = TRUE))), + breaks = c(0, 1, max(point_counts_df$count, na.rm = TRUE)), + na.value = "transparent" + ) + + geom_sf(data = sa_polygon, fill = NA, color = "gray", size = 0.5) + + theme_minimal() + + coord_sf() + + labs(title = "Coordinate distribution and density") + +ggsave("coordinate_density.pdf", path = "plots/", device = "pdf", scale = 4) + +# ---------------------------------------# +# Plot all presences in the dataset #### +# ---------------------------------------# diff --git a/R/04_04_modelling_msdm_embed_informed.R b/R/04_04_modelling_msdm_embed_informed.R deleted file mode 100644 index 719f2ec11a62a24705c41e52799c2bf43f4a36bd..0000000000000000000000000000000000000000 --- a/R/04_04_modelling_msdm_embed_informed.R +++ /dev/null @@ -1,115 +0,0 @@ -library(dplyr) -library(tidyr) -library(cito) - -source("R/utils.R") - -load("data/r_objects/model_data.RData") -load("data/r_objects/func_dist.RData") - -model_species = intersect(model_data$species, colnames(func_dist)) - -model_data = model_data %>% - dplyr::filter( - !is.na(fold_eval), - species %in% !!model_species - ) %>% - dplyr::mutate(species = as.factor(species)) %>% - sf::st_drop_geometry() - -# ----------------------------------------------------------------------# -# Train model #### -# ----------------------------------------------------------------------# -func_dist = func_dist[model_species, model_species] -embeddings = eigen(func_dist)$vectors[,1:10] -predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness") -formula = as.formula(paste0("present ~ ", paste(predictors, collapse = '+'), " + ", "e(species, weights = embeddings)")) - -# 1. Cross validation -for(fold in 1:5){ - # Prepare data - train_data = dplyr::filter(model_data, fold_eval != fold) - - # Run model - converged = F - while(!converged){ - msdm_embed_traits_fit = dnn( - formula, - data = train_data, - hidden = c(200L, 200L, 200L), - loss = "binomial", - epochs = 5000, - lr = 0.001, - batchsize = 4096, - dropout = 0.25, - burnin = 50, - optimizer = config_optimizer("adam"), - early_stopping = 200, - validation = 0.2, - device = "cuda" - ) - - if(min(msdm_embed_traits_fit$losses$valid_l, na.rm = T) < 0.4){ - converged = T - } - } - - save(msdm_embed_traits_fit, file = paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_fold", fold,".RData")) -} - -# Full model -msdm_embed_traits_fit = dnn( - formula, - data = model_data, - hidden = c(200L, 200L, 200L), - loss = "binomial", - epochs = 7500, - lr = 0.001, - baseloss = 1, - batchsize = 4096, - dropout = 0.25, - burnin = 500, - optimizer = config_optimizer("adam"), - early_stopping = 300, - validation = 0.2, - device = "cuda" -) - -save(msdm_embed_traits_fit, file = paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_full.RData")) - -# ----------------------------------------------------------------------# -# Evaluate model #### -# ----------------------------------------------------------------------# -msdm_embed_traits_performance = lapply(1:5, function(fold){ - load(paste0("data/r_objects/msdm_embed_traits_results/msdm_embed_traits_fit_fold", fold, ".RData")) - - test_data_split = model_data %>% - dplyr::filter(fold_eval == fold) %>% - dplyr::group_split(species) - - lapply(test_data_split, function(test_data_spec){ - species = test_data_spec$species[1] - - performance = tryCatch({ - evaluate_model(msdm_embed_traits_fit, test_data_spec) - }, error = function(e){ - list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, - Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, - TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) - }) - - performance_summary = performance %>% - as_tibble() %>% - mutate( - species = !!species, - obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), - fold_eval = !!fold, - model = "MSDM_embed_traits", - ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") - }) %>% - bind_rows() -}) %>% - bind_rows() - -save(msdm_embed_traits_performance, file = paste0("data/r_objects/msdm_embed_traits_performance.RData")) diff --git a/R/04_05_modelling_msdm_rf.R b/R/04_05_modelling_msdm_rf.R index 5e13f48214f0b20ef85842840f3de660d49ab775..0e349b8b083717169d4717f9014d9f2c524a5ac3 100644 --- a/R/04_05_modelling_msdm_rf.R +++ b/R/04_05_modelling_msdm_rf.R @@ -1,10 +1,11 @@ library(dplyr) library(tidyr) -library(cito) +library(caret) +library(ranger) source("R/utils.R") -load("data/r_objects/model_data.RData") +load("data/r_objects/model_data_random_abs_extended.RData") model_data = model_data %>% dplyr::filter(!is.na(fold_eval)) %>% @@ -48,10 +49,11 @@ for(fold in 1:5){ metric = "Accuracy", trControl = train_ctrl, tuneGrid = tune_grid, - num.threads = 32 + weights = data_train$weight, + num.threads = 48 ) - save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold,".RData")) + save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData")) } # Full model @@ -83,20 +85,21 @@ rf_fit = caret::train( tuneGrid = tune_grid ) -save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_full.RData") +save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") # ----------------------------------------------------------------------# # Evaluate model #### # ----------------------------------------------------------------------# -msdm_rf_performance = lapply(1:5, function(fold){ - load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_fold", fold, ".RData")) +msdm_rf_random_abs_extended_performance = lapply(1:5, function(fold){ + load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData")) - test_data =dplyr::filter(model_data, fold_eval == fold) %>% + test_data = model_data %>% + dplyr::filter(fold_eval == fold, record_type != "absence_background") %>% sf::st_drop_geometry() actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P")) - probs = predict(rf_fit, test_data, type = "prob")$P - preds = predict(rf_fit, test_data, type = "raw") + probs = predict_new(rf_fit, test_data, type = "prob") + preds = predict_new(rf_fit, test_data, type = "class") eval_dfs = data.frame( species = test_data$species, @@ -139,12 +142,13 @@ msdm_rf_performance = lapply(1:5, function(fold){ species = !!species, obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), fold_eval = !!fold, - model = "MSDM_rf", + model = "MSDM_rf_random_abs_extended", ) %>% - tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") %>% + drop_na() }) %>% bind_rows() }) %>% bind_rows() -save(msdm_rf_performance, file = paste0("data/r_objects/msdm_rf_performance.RData")) +save(msdm_rf_random_abs_extended_performance, file = paste0("data/r_objects/msdm_rf_random_abs_extended_performance.RData")) diff --git a/R/04_06_rf_testing.R b/R/04_06_rf_testing.R new file mode 100644 index 0000000000000000000000000000000000000000..9057285d94f4516437072457058308eb43a6eca2 --- /dev/null +++ b/R/04_06_rf_testing.R @@ -0,0 +1,144 @@ +library(dplyr) +library(tidyr) +library(caret) +library(ranger) + +source("R/utils.R") + +load("data/r_objects/model_data_random_abs.RData") + +model_data = model_data %>% + dplyr::filter(!is.na(fold_eval)) %>% + dplyr::mutate( + species = as.factor(species), + present_fct = factor(present, levels = c("0", "1"), labels = c("A", "P")) + ) + +predictors = c("bio6", "bio17", "cmi", "rsds", "igfc", "dtfw", "igsw", "roughness", "species") + +full_data = model_data %>% + sf::st_drop_geometry() + +train_ctrl = caret::trainControl( + search = "random", + classProbs = TRUE, + summaryFunction = caret::twoClassSummary, + savePredictions = "final" +) + +# Run model +rf_fit = caret::train( + x = full_data[, predictors], + y = full_data$present_fct, + method = "ranger", + metric = "Accuracy", + trControl = train_ctrl, + tuneLength = 1, + num.threads = 48, + importance = 'impurity', + verbose = T +) + +save(rf_fit, file = "data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") + +varimp = varImp(rf_fit) + +# Cross validation +for(fold in 1:5){ + ## Preparations ##### + data_train = dplyr::filter(model_data, fold_eval != fold) %>% + sf::st_drop_geometry() + + # Define caret training routine + train_ctrl = caret::trainControl( + method = "cv", + number = 5, + classProbs = TRUE, + summaryFunction = caret::twoClassSummary, + savePredictions = "final" + ) + + tune_grid = expand.grid( + mtry = c(2,4,6,8), + splitrule = "gini", + min.node.size = c(1,4,9,16) + ) + + # Run model + rf_fit = caret::train( + x = data_train[, predictors], + y = data_train$present_fct, + method = "ranger", + metric = "Accuracy", + trControl = train_ctrl, + tuneGrid = tune_grid, + num.threads = 48, + verbose = F + ) + + save(rf_fit, file = paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold,".RData")) +} + +# ----------------------------------------------------------------------# +# Evaluate model #### +# ----------------------------------------------------------------------# +msdm_rf_random_abs_performance = lapply(1:5, function(fold){ + load(paste0("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_fold", fold, ".RData")) + + test_data = dplyr::filter(model_data, fold_eval == fold) %>% + sf::st_drop_geometry() + + actual = factor(test_data$present, levels = c("0", "1"), labels = c("A", "P")) + probs = predict(rf_fit, test_data, type = "prob")$P + preds = predict(rf_fit, test_data, type = "raw") + + eval_dfs = data.frame( + species = test_data$species, + actual, + probs, + preds + ) %>% + group_by(species) %>% + group_split() + + + lapply(eval_dfs, function(eval_df_spec){ + species = eval_df_spec$species[1] + + performance = tryCatch({ + auc = pROC::roc(eval_df_spec$actual, eval_df_spec$probs, levels = c("P", "A"), direction = ">")$auc + cm = caret::confusionMatrix(eval_df_spec$preds, eval_df_spec$actual, positive = "P") + + list( + AUC = as.numeric(auc), + Accuracy = cm$overall["Accuracy"], + Kappa = cm$overall["Kappa"], + Precision = cm$byClass["Precision"], + Recall = cm$byClass["Recall"], + F1 = cm$byClass["F1"], + TP = cm$table["P", "P"], + FP = cm$table["P", "A"], + TN = cm$table["A", "A"], + FN = cm$table["A", "P"] + ) + }, error = function(e){ + list(AUC = NA_real_, Accuracy = NA_real_, Kappa = NA_real_, + Precision = NA_real_, Recall = NA_real_, F1 = NA_real_, + TP = NA_real_, FP = NA_real_, TN = NA_real_, FN = NA_real_) + }) + + performance_summary = performance %>% + as_tibble() %>% + mutate( + species = !!species, + obs = nrow(dplyr::filter(model_data, species == !!species, fold_eval != !!fold)), + fold_eval = !!fold, + model = "MSDM_rf_random_abs", + ) %>% + tidyr::pivot_longer(-any_of(c("species", "obs", "fold_eval", "model")), names_to = "metric", values_to = "value") + }) %>% + bind_rows() +}) %>% + bind_rows() + +save(msdm_rf_random_abs_performance, file = paste0("data/r_objects/msdm_rf_random_abs_performance.RData")) diff --git a/R/05_01_performance_report.qmd b/R/05_01_performance_report.qmd index 686a903180ac4f33b3988fc3740db968059b9c65..1c1b64d5e8fa93cc5641e9f324bb58b5002f053e 100644 --- a/R/05_01_performance_report.qmd +++ b/R/05_01_performance_report.qmd @@ -12,10 +12,11 @@ library(plotly) library(DT) load("../data/r_objects/model_data.RData") -load("../data/r_objects/ssdm_results.RData") load("../data/r_objects/msdm_embed_performance.RData") -load("../data/r_objects/msdm_onehot_performance.RData") load("../data/r_objects/msdm_rf_performance.RData") +load("../data/r_objects/msdm_rf_random_abs_performance.RData") +load("../data/r_objects/msdm_rf_no_species_performance.RData") +load("../data/r_objects/msdm_rf_no_species_random_abs_performance.RData") load("../data/r_objects/range_maps.RData") load("../data/r_objects/range_maps_gridded.RData") @@ -54,10 +55,10 @@ lin_regression = function(x, y, family = "binomial"){ } # Performance table -performance = ssdm_results %>% - bind_rows(msdm_embed_performance) %>% - bind_rows(msdm_onehot_performance) %>% - bind_rows(msdm_rf_performance) %>% +performance = msdm_rf_performance %>% + bind_rows(msdm_rf_random_abs_performance) %>% + bind_rows(msdm_rf_no_species_performance) %>% + bind_rows(msdm_rf_no_species_random_abs_performance) %>% dplyr::filter(fold_eval <= 3) %>% # TODO use all folds dplyr::group_by(species, model, metric) %>% dplyr::summarise(value = mean(value, na.rm = F)) %>% @@ -174,8 +175,8 @@ Code can be found on [GitLab](https://git.idiv.de/ye87zine/symobio-modeling). ### Key findings: -- MSDM algorithms score much higher across all performance algorithms -- Among MSDM algorithms, RF outperforms NNs significantly +- MSDM algorithms score much higher across all performance algorithms +- Among MSDM algorithms, RF outperforms NNs significantly ## Analysis @@ -364,7 +365,6 @@ bslib::card(plot, full_screen = T) ``` ::: - ### Range coverage bias Range coverage bias was calculated as 1 minus the ratio of the actual range coverage and the hypothetical range coverage if all observations were maximally spread out across the range. @@ -427,4 +427,4 @@ bslib::card(plot, full_screen = T) plot = plot_performance(df_plot, metric = "recall", x_var = "range_bias", x_label = "Range coverage bias", x_log = F) bslib::card(plot, full_screen = T) ``` -::: \ No newline at end of file +::: diff --git a/R/05_02_publication_analysis.R b/R/05_02_publication_analysis.R index f1b9d262008a17cde8ad3251f1b54d31c6b3f037..9e2d074e37342fadb7268e670ef90d671ef68f71 100644 --- a/R/05_02_publication_analysis.R +++ b/R/05_02_publication_analysis.R @@ -13,7 +13,7 @@ library(cito) source("R/utils.R") -load("data/r_objects/model_data.RData") +load("data/r_objects/model_data_random_abs_extended.RData") load("data/r_objects/ssdm_results.RData") load("data/r_objects/msdm_embed_performance.RData") load("data/r_objects/msdm_onehot_performance.RData") @@ -154,7 +154,7 @@ library(cito) library(ranger) # Define plotting function -plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", "gbm", "rf", "nn", "msdm_embed", "msdm_onehot", "msdm_rf")){ +plot_predictions = function(spec, model_data, raster_data, algorithms){ # Species data load("data/r_objects/range_maps.RData") @@ -165,14 +165,17 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", sf::st_transform(sf::st_crs(pa_spec)) # Extract raster values into df - bbox_spec = sf::st_bbox(pa_spec) %>% - expand_bbox(expansion = 0.5) + bbox_spec = sf::st_bbox(range_spec) %>% + expand_bbox(expansion = 0.25) raster_crop = terra::crop(raster_data, bbox_spec) + pa_crop = pa_spec[st_intersects(pa_spec, st_as_sfc(bbox_spec), sparse = FALSE),] + new_data = raster_crop %>% terra::values(dataframe = T, na.rm = T) %>% dplyr::mutate(species = unique(pa_spec$species)) + plots = list() # Make predictions for(algorithm in algorithms){ message(algorithm) @@ -187,7 +190,16 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", predictions = factor(round(probabilities), levels = c("0", "1"), labels = c("A", "P")) } else if(algorithm == "msdm_rf"){ load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData") - predictions = predict(rf_fit, new_data, type = "raw") + predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) + } else if(algorithm == "msdm_rf_random_abs"){ + load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") + predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) + } else if(algorithm == "msdm_rf_fit_no_species"){ + load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_full.RData") + predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) + } else if(algorithm == "msdm_rf_fit_no_species_random_abs"){ + load("data/r_objects/msdm_rf/msdm_rf_fit_no_species_random_abs_full.RData") + predictions = predict(rf_fit, new_data, type = "raw", num.threads = 48) } else if(algorithm == "nn") { load(paste0("data/r_objects/ssdm_results/full_models/", spec, "_nn_fit.RData")) new_data_tmp = dplyr::select(new_data, -species) @@ -200,17 +212,26 @@ plot_predictions = function(spec, model_data, raster_data, algorithms = c("gam", } raster_pred = terra::rast(raster_crop, nlyrs = 1) - values(raster_pred)[as.integer(rownames(new_data))] <- predictions + values(raster_pred)[as.integer(rownames(new_data))] <- as.integer(predictions) - 1 # Plot - plot(raster_pred, - type = "classes", - levels = c("Absent", "Present"), - main = paste0("Range prediction (", toupper(algorithm), "): ", spec)) - point_colors = sapply(pa_spec$present, function(x) ifelse(x == 0, "#000000AA", "#FFFFFFAA")) - plot(pa_spec[,"present"], col = point_colors, add = T, pch = 16, cex = 0.7) - plot(range_spec, border = "red", lwd = 1.5, col = NA, add = T) + p = ggplot() + + tidyterra::geom_spatraster(data = as.factor(raster_pred), maxcell = 5e7) + + scale_fill_manual(values = c("0" = "black", "1" = "yellow"), name = "Prediction", na.translate = FALSE) + + geom_sf(data = pa_crop, aes(shape = as.factor(present)), color = "#FFFFFF99") + + geom_sf(data = range_spec, col = "red", fill = NA) + + scale_shape_manual(values = c("0" = 1, "1" = 4), name = "Observation") + + theme_minimal() + + coord_sf() + + labs(title = paste0("Predictions vs Observations (", algorithm, "): ", spec)) + + guides(shape = guide_legend( + override.aes = list(color = "black") # This makes shapes black in the legend only + )) + + plots[[algorithm]] = p } + + return(plots) } # Load raster @@ -221,14 +242,12 @@ raster_data = terra::rast(raster_filepaths) %>% terra::crop(sf::st_bbox(sa_polygon)) %>% terra::project(sf::st_crs(model_data)$input) -specs = sort(sample(levels(model_data$species), 30)) +specs = sort(sample(levels(model_data$species), 4)) for(spec in specs){ - pdf(file = paste0("plots/range_predictions/", spec, " (msdm).pdf")) - tryCatch({ - plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_embed", "msdm_onehot", "msdm_rf")) - }, finally = { - dev.off() - }) + pdf(file = paste0("plots/range_predictions/", spec, " (msdm_rf).pdf"), width = 12) + plots = plot_predictions(spec, model_data, raster_data, algorithms = c("msdm_rf", "msdm_rf_random_abs", "msdm_rf_fit_no_species", "msdm_rf_fit_no_species_random_abs")) + lapply(plots, print) + dev.off() } # ------------------------------------------------------------------ # @@ -237,6 +256,7 @@ for(spec in specs){ load("data/r_objects/msdm_embed_results/msdm_embed_fit_full.RData") load("data/r_objects/msdm_onehot_results/msdm_onehot_fit_full.RData") load("data/r_objects/msdm_rf/msdm_rf_fit_full.RData") +load("data/r_objects/msdm_rf/msdm_rf_fit_random_abs_full.RData") compare_species_predictions = function(model, sample_size){ specs = sample(unique(model_data$species), 2, replace = F) @@ -265,7 +285,7 @@ compare_species_predictions(msdm_onehot_fit, 500) compare_species_predictions(rf_fit, 500) ## --> Predictions for different species are weakly/moderately correlated in NN models (makes sense) -## --> Predictions for different species are always highly correlated in RF model (seems problematioc) +## --> Predictions for different species are always highly correlated in RF model (seems problematic) # ------------------------------------------------------------------ # # 5. Compare predictions across models #### diff --git a/R/_publish.yml b/R/_publish.yml index 91c7c692506ebfc3ec58deb69d220df70c4025d8..c18ea3e7bdbaf8fc276bc95e87377c6f205af5a4 100644 --- a/R/_publish.yml +++ b/R/_publish.yml @@ -1,16 +1,4 @@ -- source: 05_performance_analysis.qmd - quarto-pub: - - id: 309c45c3-1515-4985-a435-9ffa1888c5e7 - url: 'https://chrkoenig.quarto.pub/ssdm-performance-analysis-0a06' -- source: 05_01_performance_analysis_carsten.qmd - quarto-pub: - - id: f9cafa63-bfd4-4c00-b401-09ec137bd3ce - url: 'https://chrkoenig.quarto.pub/sdm-performance-carsten' -- source: 05_01_performance_analysis.qmd - quarto-pub: - - id: cfaa8a4d-31e3-410c-ba1a-399aad5582fd - url: 'https://chrkoenig.quarto.pub/sdm-performance-analysis-8272' - source: 05_01_performance_report.qmd quarto-pub: - - id: 25ccfe7b-b9d4-4bc9-bbf8-823676aab7bd - url: 'https://chrkoenig.quarto.pub/sdm-performance-report-f08b' + - id: 77b30762-b473-447d-be18-d4bbd6261fbf + url: 'https://chrkoenig.quarto.pub/sdm-performance-report' diff --git a/R/utils.R b/R/utils.R index 1f9ec189484331ae200786eae0515028d9a1452b..d00b91a6355e24ee441a8baa1a22f5e55bbde698 100644 --- a/R/utils.R +++ b/R/utils.R @@ -42,18 +42,11 @@ predict_new = function(model, data, type = "prob"){ return(probs) } else { preds = predict(model, data, type = "raw") + return(preds) } } } -if(class(model) %in% c("citodnn", "citodnnBootstrap")){ - p1 = predict(model, df1, type = "response")[,1] - p2 = predict(model, df2, type = "response")[,1] -} else { - p1 = predict(model, df1, type = "prob")$P - p2 = predict(model, df2, type = "prob")$P -} - evaluate_model <- function(model, data) { # Accuracy: The proportion of correctly predicted instances (both true positives and true negatives) out of the total instances. # Formula: Accuracy = (TP + TN) / (TP + TN + FP + FN) diff --git a/env_vars.png b/env_vars.png new file mode 100644 index 0000000000000000000000000000000000000000..273e21027964862a1deec247aafe6ca9eaea63f4 Binary files /dev/null and b/env_vars.png differ