# NOTE(review): the following lines are web-page artifacts from the GitLab
# blob view (error banner, commit message, code-owner notice, file header),
# not part of the script; kept here as comments so the file parses.
# Something went wrong on our end
# -
# Maria Voigt authored
# higher doesn't seem to have influence on crashed jobs
# Maria Voigt authoredhigher doesn't seem to have influence on crashed jobs
# Code owners
# Assign users and groups as approvers for specific file changes. Learn more.
# abundance_model.R 20.59 KiB
# Fit abundance model #
#--------------------------#
#----------------#
# Load Libraries #
#----------------#
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(MASS))
suppressPackageStartupMessages(library(gtools))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(foreach))
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(optparse))
#-----------------------------#
# command line option parsing #
#-----------------------------#
print(paste("Start model_fitting script", Sys.time()))

# Command-line interface of this script.
option_list <- list(
  make_option(c("-i", "--input-directory"), dest = "input_directory",
              type = "character", help = "directory with input files",
              metavar = "/path/to/input-dir"),
  make_option(c("-o", "--output-directory"), dest = "output_directory",
              type = "character", help = "directory with output files",
              metavar = "/path/to/output-dir"),
  # no default and not validated below; if omitted, ESW_aerial is NULL
  # (only used when --include-aerial is given)
  make_option("--ESW-aerial", dest = "ESW_aerial", type = "double",
              help = "aerial effective strip width"),
  make_option("--exclude-year", dest = "exclude_year", type = "integer",
              default = NA, help = "year to exclude", metavar = "2015"),
  make_option("--exclude-grid", dest = "exclude_grid", type = "integer",
              default = NA, help = "index of grid-cells to exclude", metavar = "1"),
  make_option("--include-aerial", dest = "include_aerial", action = "store_true",
              default = FALSE, help = "include aerial transects"),
  make_option("--stability", action = "store_true", default = FALSE,
              help = "do stability analysis"),
  make_option(c("-q", "--quiet"), dest = "verbose_script",
              action = "store_false", default = TRUE,
              help = "don't print all intermediate results")
)
# verbose option a bit counterintuitive:
# -q stores FALSE into verbose_script, i.e. verbose == FALSE means quiet.
# NOTE(review): this variable shadows base::options(); later calls like
# options("scipen" = 100) still reach the base function because R skips
# non-function objects in call position, but renaming would be cleaner.
options <- parse_args(OptionParser(option_list = option_list))

# Both directories are mandatory.
if (is.null(options$input_directory)) {
  stop("input directory not specified, check --help")
}
if (is.null(options$output_directory)) {
  stop("output directory not specified, check --help")
}

# Range of survey years; --exclude-year must fall inside it.
exclude_year_possibilities <- 1999:2015
if (!is.na(options$exclude_year) &&
    !(options$exclude_year %in% exclude_year_possibilities)) {
  stop(paste("exclude year must be between", min(exclude_year_possibilities),
             "and", max(exclude_year_possibilities)))
}
# Pull the parsed options into plain top-level variables, then echo them
# (in the same order as before) when running verbosely.
is_verbose <- options$verbose_script
if (is_verbose) {
  print(paste("Reminder: Include aerial is ", options$include_aerial, ". Please
pay attention that the same is the case for post-processing."))
}

# input directory
indir <- options$input_directory
# directory in which output is written
outdir <- options$output_directory
ESW_aerial <- options$ESW_aerial
do_stability <- options$stability
include_aerial <- options$include_aerial
exclude_year <- options$exclude_year
exclude_grid <- options$exclude_grid

if (is_verbose) {
  print(paste("indir", indir))
  print(paste("outdir", outdir))
  print(paste("Aerial ESW", ESW_aerial))
  print(paste("stability", do_stability))
  print(paste("include_aerial", include_aerial))
  print(paste("exclude year", exclude_year))
  print(paste("exclude grid", exclude_grid))
}
#---------#
# Globals #
#---------#
# Directory holding the shared helper functions sourced below.
indir_fun <- "~/orangutan_density_distribution/src/functions"
if (is_verbose) print(paste("indir_fun", indir_fun))

# Worker cluster for the foreach() model loop further down
# (outfile = "" streams worker output to the master console).
# NOTE(review): fork clusters are unix-only.
cl <- makeForkCluster(outfile = "")
registerDoParallel(cl)

source(file.path(indir_fun, "project_functions/scale.predictors.R"))
source(file.path(indir_fun, "roger_functions/rogers_model_functions.R"))
source(file.path(indir_fun, "generic/path.to.current.R"))
source(file.path(indir_fun, "roger_functions/aic_c_fac.r"))
source(file.path(indir_fun, "roger_functions/get_conf_set.r"))

# define offset ground
ESW <- 0.01595 # effective strip width in km
# ESW_aerial already defined via --ESW-aerial
print(paste("this is ESW aerial:", ESW_aerial))
NCS <- 1.12 # nest construction rate from Spehar et al. 2010
PNB <- 0.88 # proportion of nest builders from Spehar et al. 2010
options("scipen" = 100, "digits" = 4)

# File-name suffix recording which subset (if any) is excluded from fitting.
# If both a year and a grid cell are given, the grid-cell suffix wins.
if (is.na(exclude_year) && is.na(exclude_grid)) {
  name_suffix <- ""
}
if (!is.na(exclude_year)) {
  name_suffix <- paste0("year_", exclude_year, "_")
}
if (!is.na(exclude_grid)) {
  name_suffix <- paste0("gridcell_", exclude_grid, "_")
}
#---------------#
# Import data   #
#---------------#
# Load the most recent version of each prepared input file
# (path.to.current resolves the newest file matching the prefix).
geography_path <- path.to.current(indir, "geography_observation", "rds")
if (is_verbose) print(paste("geography-path", geography_path))
geography <- readRDS(geography_path)

transects_path <- path.to.current(indir, "transects", "rds")
if (is_verbose) print(paste("transect_path", transects_path))
transects <- readRDS(transects_path)

predictors_path <- path.to.current(indir, "predictors_observation_20", "rds")
if (is_verbose) print(paste("predictors-path", predictors_path))
predictors <- readRDS(predictors_path)

# calculate x and y transect center from the start/end coordinates
geography$unscaled_x_center <- rowMeans(cbind(geography$x_start, geography$x_end),
                                        na.rm = TRUE)
geography$unscaled_y_center <- rowMeans(cbind(geography$y_start, geography$y_end),
                                        na.rm = TRUE)

#--------------------------------#
# Transform and scale predictors #
#--------------------------------#
# Transform skewed predictors before scaling:
# sqrt for distance_PA / deforestation_gaveau, log(x + 1) for the rest.
predictors[predictors$predictor == "distance_PA", "value"] <- sqrt(
  predictors[predictors$predictor == "distance_PA", "value"])
predictors[predictors$predictor == "human_pop_dens", "value"] <- log(
  predictors[predictors$predictor == "human_pop_dens", "value"] + 1)
predictors[predictors$predictor == "deforestation_gaveau", "value"] <- sqrt(
  predictors[predictors$predictor == "deforestation_gaveau", "value"])
predictors[predictors$predictor == "plantation_distance", "value"] <- log(
  predictors[predictors$predictor == "plantation_distance", "value"] + 1)
predictors[predictors$predictor == "pulp_distance", "value"] <- log(
  predictors[predictors$predictor == "pulp_distance", "value"] + 1)
predictors[predictors$predictor == "palm_distance", "value"] <- log(
  predictors[predictors$predictor == "palm_distance", "value"] + 1)
# STARTING THE SCALING
# SCALE PREDICTORS
# these are the predictors that will be used in the model
predictor_names_for_scaling <- c("dem", "slope", "temp_mean", "rain_dry",
                                 "rain_var", "ou_killings",
                                 "ou_killing_prediction", "human_pop_dens",
                                 "perc_muslim", "peatswamp", "lowland_forest",
                                 "lower_montane_forest", "road_dens",
                                 "distance_PA", "fire_dens",
                                 "deforestation_hansen", "deforestation_gaveau",
                                 "plantation_distance", "pulp_distance",
                                 "palm_distance", "dom_T_OC", "dom_T_PH")
# additional predictors that have to be scaled: year and x- and y-center
predictor_names_add <- c("year", "x_center", "y_center")
# prepare predictors data-frame
predictors <- dplyr::select(predictors, id, predictor, unscaled_year = year,
                            unscaled_value = value)
# keep only rows that belong to a transect (gets rid of occurrence data)
predictors <- predictors %>%
  inner_join(transects, by = "id")
# exclude aerial transects unless --include-aerial was given
if (!include_aerial) {
  predictors <- filter(predictors, group != "aerial")
}
# report, then drop, rows with missing predictor values
# (previous comment said "rows that have zero"; it is NA rows that go)
if (is_verbose) {
  print("how many rows with na in scaled_value")
  nrow(predictors[is.na(predictors$unscaled_value), ])
}
# deleting is.na values here
predictors <- predictors[!is.na(predictors$unscaled_value), ]
# scale predictors with the helper sourced from scale.predictors.R
predictors_obs <- scale.predictors.observation(predictor_names_for_scaling,
                                               predictor_names_add,
                                               predictors,
                                               geography)
# re-attach geography (minus the columns superseded by scaling) and
# transect info, keyed by transect id
predictors_obs <- geography %>%
  dplyr::select(-c(year, unscaled_x_center, unscaled_y_center)) %>%
  dplyr::select(-group) %>%
  inner_join(transects, by = "id") %>%
  inner_join(predictors_obs, by = "id")
# predictors used in model
predictor_names <- c("year", "temp_mean", "rain_var", "rain_dry", "dom_T_OC",
                     "peatswamp", "lowland_forest",
                     "lower_montane_forest", "deforestation_hansen",
                     "human_pop_dens", "ou_killing_prediction",
                     "perc_muslim")
# ou density and offset term for ground and absence transects
other_predictors_obs <- filter(predictors_obs, group != "aerial")
# density: nests per effective search effort, corrected by nest decay,
# nest construction rate (NCS) and proportion of nest builders (PNB)
other_predictors_obs$ou_dens <- (other_predictors_obs$nr_nests /
                                   (other_predictors_obs$length_km * ESW * 2)) *
  (1 / (other_predictors_obs$nest_decay * NCS * PNB))
# offset: log of the same effort term, used as glm offset below
other_predictors_obs$offset_term <- log(other_predictors_obs$length_km * ESW *
                                          2 * other_predictors_obs$nest_decay *
                                          NCS * PNB)
# ou density and offset term for aerial transects (aerial ESW instead)
if (include_aerial) {
  aerial_predictors_obs <- dplyr::filter(predictors_obs, group == "aerial")
  # density
  aerial_predictors_obs$ou_dens <- (aerial_predictors_obs$nr_nests /
                                      (aerial_predictors_obs$length_km *
                                         ESW_aerial * 2)) *
    (1 / (aerial_predictors_obs$nest_decay * NCS * PNB))
  # offset
  aerial_predictors_obs$offset_term <- log(aerial_predictors_obs$length_km *
                                             ESW_aerial * 2 *
                                             aerial_predictors_obs$nest_decay *
                                             NCS * PNB)
  # bind the two groups together
  predictors_obs <- aerial_predictors_obs %>%
    bind_rows(other_predictors_obs)
} else {
  predictors_obs <- other_predictors_obs
}
# order by transect id and drop tbl classes
predictors_obs <- predictors_obs %>%
  arrange(id) %>%
  as.data.frame(.)
if (is_verbose) {
  print("look at predictors_obs")
  str(predictors_obs)
  summary(predictors_obs)
}
# save the relevant output for the prediction and the validation (USED IN THESE SCRIPTS)
saveRDS(predictors_obs,
        file = file.path(outdir, paste0("predictors_observation_scaled_",
                                        name_suffix,
                                        Sys.Date(), ".rds")))

# exclude_grid is an index (1:82) that needs to be translated into the actual
# grid_id of the cell; grid_cell_nr is NA when no grid cell is excluded.
grid_cell_nrs <- unique(predictors_obs$grid_id)
grid_cell_nr <- grid_cell_nrs[exclude_grid]

# Leave-one-out subsets: hold back the excluded year from fitting ...
if (!is.na(exclude_year)) {
  predictors_excluded_year <- predictors_obs[predictors_obs$unscaled_year == exclude_year, ]
  predictors_obs <- predictors_obs[predictors_obs$unscaled_year != exclude_year, ]
}
# ... or the excluded grid cell
if (!is.na(exclude_grid)) {
  predictors_excluded_grid <- predictors_obs[predictors_obs$grid_id == grid_cell_nr, ]
  predictors_obs <- predictors_obs[predictors_obs$grid_id != grid_cell_nr, ]
}

# Maximum glm.nb iterations. The former if/else assigned 250 in both
# branches (raising it for exclusion runs "doesn't seem to have influence
# on crashed jobs"), so the dead conditional is collapsed to one value.
nr_maxit <- 250
if(is_verbose){ print(paste("3. start making all_model_terms", Sys.time()))}
# #build models needed for analysis with a function
# all_model_terms: one row per candidate model, with a 0/1 entry per term
# (rows are indexed and compared against 1 in the foreach loop below).
# Covariates enter additively (no interactions: env.cov.int = list());
# rain_dry is additionally allowed as a quadratic term (env.cov.2).
all_model_terms <- built.all.models(env.cov.names =
c( "year",
"temp_mean",
"rain_var",
"rain_dry",
"dom_T_OC",
"peatswamp",
"lowland_forest",
"lower_montane_forest",
"deforestation_hansen",
"human_pop_dens",
"ou_killing_prediction",
"perc_muslim"),
env.cov.int = list(),
env.cov.2 = c("rain_dry"))
print(paste("4. end making all model terms", Sys.time()))
# m_terms: the model terms in the column order of all_model_terms
# ("1" = intercept-only); subsetting m_terms by a row of all_model_terms
# yields the formula terms of that candidate model.
m_terms <- c("1",
"year",
"temp_mean",
"rain_var",
"rain_dry",
"dom_T_OC",
"peatswamp",
"lowland_forest",
"lower_montane_forest",
"deforestation_hansen",
"human_pop_dens",
"ou_killing_prediction",
"perc_muslim",
"I(rain_dry^2)")
# save model_terms here
# Fit the full model once, only to harvest the coefficient names as R
# produces them (intercept, factor expansions, I(rain_dry^2)); these label
# the coeff_/P_/SE_ columns of the results data frame below.
model_terms <- names(glm.nb(as.formula(paste("nr_nests~", paste(m_terms,
collapse = "+"),
"+ offset(offset_term)",
sep = "")),
data = predictors_obs,
control = glm.control(maxit = nr_maxit))$coefficients)
# prediction estimates
# Design-matrix-like frame: intercept column, the model predictors, and
# rain_dry squared as the last column.
# NOTE(review): predictor_estimates is not referenced again in this file;
# presumably consumed downstream via the save.image() at the end — confirm.
intercept <- rep(1, nrow(predictors_obs))
predictor_estimates <- cbind( intercept,
predictors_obs[ , predictor_names],
predictors_obs[ ,"rain_dry"] *
predictors_obs[ ,"rain_dry"])
names(predictor_estimates) <- c("intercept", predictor_names,
paste0("I(", "rain_dry", "^2)"))
# calculate stability of the full model if desired (--stability):
# refit the full model and record, per coefficient, the fitted slope plus
# the range of case-deletion changes (dfbeta) around it.
if (do_stability) {
  if (is_verbose) print(paste("Start stability calculation", Sys.time()))
  # the last row of all_model_terms encodes the full model
  full_model <- paste(
    m_terms[all_model_terms[nrow(all_model_terms), ] == 1],
    collapse = "+")
  if (is_verbose) print(paste("This is the full-model", full_model))
  model <- as.formula(
    paste("nr_nests ~", full_model, "+ offset(offset_term)"))
  res_full <- glm.nb(model, data = predictors_obs,
                     control = glm.control(maxit = nr_maxit))
  # dfbeta(res_full) gives one row per observation; range() over MARGIN = 2
  # yields the min/max shift of each coefficient when single observations
  # are dropped.
  dfbeta_frame <- data.frame(slope = res_full$coefficients,
                             res_full$coefficients +
                               t(apply(X = dfbeta(res_full),
                                       MARGIN = 2, FUN = range)))
  names(dfbeta_frame) <- c("slope", "min", "max")
  write.csv(dfbeta_frame,
            file.path(outdir, paste0("glm_abundance_stability_",
                                     Sys.Date(), ".csv")),
            row.names = TRUE)
}
# #run models
# Fit every candidate model in parallel; each worker returns a one-row data
# frame (coefficients, p values, SEs, theta, AIC, R2, and R2_cross for the
# leave-one-out runs), rbind-combined into results_res.
if (is_verbose) print(paste("8. Start running models", Sys.time()))
results_res <- foreach(i = seq_len(nrow(all_model_terms)),
                       .combine = rbind) %dopar% {
  # make results dataframe; cross-validation runs get one extra column
  if (is.na(exclude_year) && is.na(exclude_grid)) {
    result <- as.data.frame(matrix(NA, ncol = 3 * length(model_terms) + 5,
                                   nrow = 1))
    names(result) <- c("model", paste("coeff", model_terms, sep = "_"),
                       paste("P", model_terms, sep = "_"),
                       paste("SE", model_terms, sep = "_"),
                       "theta", "SE.theta", "AIC", "R2")
  } else {
    result <- as.data.frame(matrix(NA, ncol = 3 * length(model_terms) + 6,
                                   nrow = 1))
    names(result) <- c("model", paste("coeff", model_terms, sep = "_"),
                       paste("P", model_terms, sep = "_"),
                       paste("SE", model_terms, sep = "_"),
                       "theta", "SE.theta", "AIC", "R2",
                       "R2_cross")
  }
  # make model: formula from the terms switched on in row i
  model <- as.formula(
    paste("nr_nests ~",
          paste(m_terms[all_model_terms[i, ] == 1], collapse = "+"),
          "+ offset(offset_term)"))
  res <- glm.nb(model, data = predictors_obs,
                control = glm.control(maxit = nr_maxit))
  # model
  result[, "model"] <- paste(m_terms[all_model_terms[i, ] == 1], collapse = "+")
  # coefficients
  model_coefficients <- as.vector(res$coefficients)
  result[, paste0("coeff_", names(res$coefficients))] <- model_coefficients
  # p value (w/o parameter for autocor.)
  result[, paste0("P_", names(res$coefficients))] <-
    summary(res)$coefficients[, "Pr(>|z|)"][names(res$coefficients)]
  # SE
  result[, paste0("SE_", names(res$coefficients))] <-
    summary(res)$coefficients[, "Std. Error"][names(res$coefficients)]
  # theta
  result[, "theta"] <- summary(res)$theta
  result[, "SE.theta"] <- summary(res)$SE.theta
  # aic in last column
  result[, "AIC"] <- extractAIC(res)[2]
  # In-sample fit: predict with the offset zeroed, then regress
  # log(observed density + 1) on log(prediction + 1).
  predictors_obs_pred <- predictors_obs
  predictors_obs_pred$offset_term <- 0
  prediction_per_transect <- predict.glm(res,
                                         newdata = predictors_obs_pred,
                                         type = "response")
  comparison_lm <- lm(log(predictors_obs$ou_dens + 1) ~
                        log(prediction_per_transect + 1))
  result[, "R2"] <- summary(comparison_lm)$r.squared
  # if we are excluding years, this is the test of predicted data vs
  # observed data for that year (with which the model wasn't fitted)
  if (!is.na(exclude_year)) {
    predictors_excluded_year_pred <- predictors_excluded_year
    predictors_excluded_year_pred$offset_term <- 0
    # BUG FIX: predict on the *_pred copy (offset_term zeroed) so the
    # held-out prediction is on the same scale as the in-sample R2 above;
    # previously the un-zeroed frame was passed and the copy was unused.
    prediction_transect_excluded_year <- predict.glm(
      res,
      newdata = predictors_excluded_year_pred,
      type = "response")
    cross_lm_year <- lm(log(predictors_excluded_year$ou_dens + 1) ~
                          log(prediction_transect_excluded_year + 1))
    result[, "R2_cross"] <- summary(cross_lm_year)$r.squared
  }
  # same for an excluded grid cell
  if (!is.na(exclude_grid)) {
    predictors_excluded_grid_pred <- predictors_excluded_grid
    predictors_excluded_grid_pred$offset_term <- 0
    # BUG FIX: as above, use the zero-offset copy.
    prediction_transect_excluded_grid <- predict.glm(
      res,
      newdata = predictors_excluded_grid_pred,
      type = "response")
    cross_lm_grid <- lm(log(predictors_excluded_grid$ou_dens + 1) ~
                          log(prediction_transect_excluded_grid + 1))
    result[, "R2_cross"] <- summary(cross_lm_grid)$r.squared
  }
  return(result)
}
# Akaike confidence set over all fitted models (conf.set sourced from
# get_conf_set.r), keyed back to the model description strings.
c_set <- cbind(as.character(results_res$model), conf.set(aic = results_res$AIC))
names(c_set) <- c("model", names(c_set)[-1])
# for the export: attach the Akaike weights to the raw results
abundMod_results <- results_res %>%
  mutate(w_aic = c_set$w.aic)
# calculate coefficient summary
summary_mean_coefficients <- calculate.mean.coefficients(m_terms, results_res, c_set)
# make table for model output: drop helper columns, keep the AIC column to
# join the confidence-set membership back onto the full results.
# (a stray top-level `names(c_set)` autoprint — leftover debugging — was
# removed here)
c_set$model <- NULL
names(c_set)[1] <- "AIC"
c_set$d.aic <- NULL
c_set$w.aic <- NULL
# NOTE(review): this joins on the floating-point AIC value; it works because
# both columns originate from the same conf.set() call, but an index join
# would be more robust.
results_out <- right_join(results_res, c_set, by = "AIC")
results_out <- results_out[order(results_out$AIC), ]
# save the relevant output for the prediction and the validation
saveRDS(abundMod_results, file = file.path(outdir, paste0("abundMod_results_",
                                                          name_suffix,
                                                          Sys.Date(), ".rds")))
# these are the terms that go into the model (need to guarantee same things in validation)
saveRDS(m_terms, file = file.path(outdir, paste0("m_terms_",
                                                 name_suffix,
                                                 Sys.Date(), ".rds")))
# save the model results for interpretation
write.csv(results_out,
          file = file.path(outdir,
                           paste0("abundMod_results_",
                                  name_suffix,
                                  Sys.Date(), ".csv")))
# save the mean coefficients for interpretation
write.csv(summary_mean_coefficients,
          file = file.path(outdir,
                           paste0("abundMod_mean_coefficients_",
                                  name_suffix,
                                  Sys.Date(), ".csv")))
# full workspace snapshot for downstream scripts
save.image(file.path(outdir, paste0("abundance_model_fitting_",
                                    name_suffix,
                                    Sys.Date(), ".RData")))
# Release the worker processes started with makeForkCluster() above;
# previously the cluster was never stopped.
stopCluster(cl)
print(paste("Finished model_fitting script, at", Sys.time()))