# abundance_model.R
#--------------------------#
# Fit abundance model      #
#--------------------------#

#----------------#
# Load Libraries #
#----------------#

suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(MASS))
suppressPackageStartupMessages(library(gtools))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(foreach))
suppressPackageStartupMessages(library(doParallel))
suppressPackageStartupMessages(library(optparse))


#-----------------------------#
# command line option parsing #
#-----------------------------#
#-----------------------------#
# command line option parsing #
#-----------------------------#
print(paste("Start model_fitting script", Sys.time()))

# Command line interface of the script. Note the -q/--quiet flag: it uses
# action = "store_false" on dest verbose_script, so passing -q sets
# verbose_script to FALSE (quiet); the default is verbose (TRUE).
option_list <- list(
  make_option(c("-i", "--input-directory"),
              dest = "input_directory", type = "character",
              metavar = "/path/to/input-dir",
              help = "directory with input files"),
  make_option(c("-o", "--output-directory"),
              dest = "output_directory", type = "character",
              metavar = "/path/to/output-dir",
              help = "directory with output files"),
  make_option("--exclude-year",
              dest = "exclude_year", type = "integer", default = NA,
              metavar = "2015",
              help = "year to exclude"),
  make_option("--include-aerial",
              dest = "include_aerial", action = "store_true", default = FALSE,
              help = "include aerial transects"),
  make_option("--stability",
              action = "store_true", default = FALSE,
              help = "do stability analysis"),
  make_option(c("-q", "--quiet"),
              dest = "verbose_script", action = "store_false", default = TRUE,
              help = "don't print all intermediate results")
)

options <- parse_args(OptionParser(option_list = option_list))

# both directories are mandatory
if (is.null(options$input_directory)) {
  stop("input directory not specified, check --help")
}

if (is.null(options$output_directory)) {
  stop("output directory not specified, check --help")
}

# --exclude-year, when given, must fall inside the survey period
exclude_year_possibilities <- c(1999:2015)

if (!is.na(options$exclude_year) &&
    !(options$exclude_year %in% exclude_year_possibilities)) {
  stop(paste("exclude year must be between",
             min(exclude_year_possibilities),
             "and",
             max(exclude_year_possibilities)))
}

is_verbose <- options$verbose_script

# small helper: print a pasted message only when not running with -q/--quiet
say <- function(...) if (is_verbose) print(paste(...))

say("Reminder: Include aerial is ", options$include_aerial, ". Please
            pay attention that the same is the case for post-processing.")

# input directory
indir <- options$input_directory
say("indir", indir)

# directory in which output is written
outdir <- options$output_directory
say("outdir", outdir)

# whether to run the dfbeta stability analysis on the full model
do_stability <- options$stability
say("stability", do_stability)

# whether aerial transects enter the model (affects ESW used below)
include_aerial <- options$include_aerial
say("include_aerial", include_aerial)

# year held out for cross-validation (NA = use all years)
exclude_year <- options$exclude_year
say("exclude year", exclude_year)


#---------#
# Globals #
#---------#

# location of the shared project function files
indir_fun <- "~/orangutan_density_distribution/src/functions"
if (is_verbose) print(paste("indir_fun", indir_fun))

# fork-based cluster for the parallel model fitting below;
# outfile = "" sends worker output to the master console
cl <- makeForkCluster(outfile = "")
registerDoParallel(cl)

# project helpers: model building, newest-file lookup, AICc, confidence sets
source(file.path(indir_fun, "roger_functions/rogers_model_functions.R"))
source(file.path(indir_fun, "generic/path.to.current.R"))
source(file.path(indir_fun, "roger_functions/aic_c_fac.r"))
source(file.path(indir_fun, "roger_functions/get_conf_set.r"))

# constants for converting nest counts to orangutan densities
ESW <- 0.01595      # effective strip width in km (ground transects)
ESW_aerial <- 0.075 # effective strip width for aerial transects
NCS <- 1.12         # nest construction rate from Spehar et al. 2010
PNB <- 0.88         # proportion of nest builders from Spehar et al. 2010

# avoid scientific notation in printed/saved numbers
options("scipen" = 100, "digits" = 4)

# output files are tagged with the held-out year, if any
name_suffix <- if (is.na(exclude_year)) "" else paste0(exclude_year, "_")

#---------------#
#  Import data  #
#---------------#

# path.to.current (sourced above) resolves the newest matching .rds in indir

geography_path <- path.to.current(indir, "geography_observation", "rds")
if (is_verbose) print(paste("geography-path", geography_path))
geography <- readRDS(geography_path)

transects_path <- path.to.current(indir, "transects", "rds")
if (is_verbose) print(paste("transect_path", transects_path))
transects <- readRDS(transects_path)

predictors_path <- path.to.current(indir, "predictors_observation", "rds")
if (is_verbose) print(paste("predictors-path", predictors_path))
predictors <- readRDS(predictors_path)

# transect midpoint; na.rm = TRUE so a transect with only one known endpoint
# still gets a center (fix: spell out TRUE instead of the reassignable T)
geography$unscaled_x_center <- rowMeans(cbind(geography$x_start, geography$x_end), na.rm = TRUE)
geography$unscaled_y_center <- rowMeans(cbind(geography$y_start, geography$y_end), na.rm = TRUE)

#--------------------------------#
# Transform and scale predictors #
#--------------------------------#

# Transform skewed predictors before scaling: distances and deforestation get
# sqrt or log(x + 1). Factored into two loops to remove the six copy-pasted
# statements that differed only in the predictor name.
sqrt_transformed <- c("distance_PA", "deforestation_gaveau")
log1p_transformed <- c("human_pop_dens", "plantation_distance",
                       "pulp_distance", "palm_distance")

for (p in sqrt_transformed) {
  rows <- predictors$predictor == p
  predictors[rows, "value"] <- sqrt(predictors[rows, "value"])
}
for (p in log1p_transformed) {
  rows <- predictors$predictor == p
  predictors[rows, "value"] <- log(predictors[rows, "value"] + 1)
}

# predictors that get z-scaled below
predictor_names_for_scaling <- c( "dem", "slope", "temp_mean", "rain_dry", "rain_var",
                                  "ou_killings", "ou_killing_prediction", "human_pop_dens",
                                  "perc_muslim", "peatswamp", "lowland_forest", "lower_montane_forest" ,
                                  "road_dens", "distance_PA", "fire_dens", "deforestation_hansen",
                                  "deforestation_gaveau", "plantation_distance", "pulp_distance", "palm_distance",
                                  "dom_T_OC", "dom_T_PH")
# predictors used in the abundance model itself
predictor_names <- c("year", "temp_mean", "rain_var", "rain_dry", "dom_T_OC",
                    "peatswamp", "lowland_forest",
                    "lower_montane_forest", "deforestation_hansen",
                    "human_pop_dens", "ou_killing_prediction",
                    "perc_muslim" )

# keep transformed value alongside the original under explicit names
predictors <- dplyr::select(predictors, id, predictor, unscaled_year = year,
                            unscaled_value = value)

# inner join against transects drops occurrence-only rows
predictors <- predictors %>%
  inner_join(transects, by = "id")

# exclude aerial transects unless explicitly requested
# (fix: `!include_aerial` instead of comparing against the reassignable F)
if (!include_aerial) {
  predictors <- filter(predictors, group != "aerial")
}

# scale predictors (z-transform), one predictor at a time
for (predictor_name in predictor_names_for_scaling) {
  rows <- predictors$predictor == predictor_name
  predictors[rows, "value"] <-
    as.numeric(as.vector(scale(predictors[rows, "unscaled_value"])))
}

# year is scaled across all rows at once
predictors$year <- as.numeric(as.vector(scale(predictors[ , "unscaled_year"])))



# drop rows whose scaled value is NA (scale() can produce NA/NaN, e.g. for
# constant predictors); report how many are affected first
if (is_verbose) {
  print("how many rows with na in scaled_value")
  # fix: the bare nrow(...) was never shown when the script is source()d
  # (no auto-printing inside a sourced block) -- wrap it in print()
  print(nrow(predictors[is.na(predictors$value), ]))
}
predictors <- predictors[!is.na(predictors$value), ]

# geography's own year column would clash with the scaled year after the join
geography <- dplyr::select(geography, -c(year))

# wide table of scaled predictor values: one row per id + year
predictors_obs <- predictors %>%
  dplyr::filter(predictor %in% predictor_names_for_scaling) %>%
  dcast(id + year ~ predictor,  value.var = "value") %>%
  inner_join(geography, by = "id") %>%
  dplyr::select(-group) %>%
  inner_join(transects, by = "id" )

# same table with untransformed/unscaled values, columns prefixed "unscaled_"
predictors_obs_unscaled <- predictors %>%
  dplyr::filter(predictor %in% predictor_names_for_scaling) %>%
  dcast(id + unscaled_year ~ predictor,  value.var = "unscaled_value")

names(predictors_obs_unscaled)[-c(1,2)] <- paste0("unscaled_",
                                                  names(predictors_obs_unscaled)[-c(1,2)])

# NOTE(review): this joins by "id" only although both tables were cast by
# id + year -- if an id can occur in several years this fans rows out.
# Verify ids are unique per year, or join by c("id", ...) including the year.
predictors_obs <- left_join(predictors_obs, predictors_obs_unscaled, by = "id")


# scale x and y center coordinates
predictors_obs$x_center <- as.numeric(as.vector(scale(predictors_obs[ ,
                                                                      "unscaled_x_center"])))

predictors_obs$y_center <- as.numeric(as.vector(scale(predictors_obs[ ,
                                                                      "unscaled_y_center"])))

# Add the orangutan density (ou_dens) and the model offset (offset_term) to a
# set of transects. The two transect groups used identical copy-pasted code
# differing only in the effective strip width, so the math lives here once.
#
# transect_df: data frame with nr_nests, length_km and nest_decay columns
# esw:         effective strip width in km (ESW for ground, ESW_aerial for air)
# ncs, pnb:    nest construction rate / proportion of nest builders
# returns:     transect_df with ou_dens and offset_term columns added
add_density_and_offset <- function(transect_df, esw, ncs = NCS, pnb = PNB) {
  effective_area <- transect_df$length_km * esw * 2
  conversion <- transect_df$nest_decay * ncs * pnb
  transect_df$ou_dens <- (transect_df$nr_nests / effective_area) *
    (1 / conversion)
  transect_df$offset_term <- log(effective_area * conversion)
  transect_df
}

# ou density and offset term for ground and absence transects
other_predictors_obs <- add_density_and_offset(
  filter(predictors_obs, group != "aerial"), esw = ESW)

# ou density and offset term for aerial transects, if they are included
# (fix: isTRUE() instead of comparing against the reassignable T)
if (isTRUE(include_aerial)) {
  aerial_predictors_obs <- add_density_and_offset(
    dplyr::filter(predictors_obs, group == "aerial"), esw = ESW_aerial)

  predictors_obs <- aerial_predictors_obs %>%
    bind_rows(other_predictors_obs)
} else {
  predictors_obs <- other_predictors_obs
}

# order by transect id and keep only the columns used downstream
predictors_obs <- predictors_obs %>%
  arrange(id) %>%
  dplyr::select(id, group, x_start:LU, length_km:nest_decay,
                year, deforestation_gaveau:temp_mean, x_center, y_center,
                unscaled_year:unscaled_temp_mean, unscaled_x_center,
                unscaled_y_center, ou_dens, offset_term) %>%
  as.data.frame(.)


if (is_verbose) {
  print("look at predictors_obs")
  str(predictors_obs)
  # fix: summary() does not print its value when the script is source()d
  # (str() prints as a side effect, summary() does not) -- wrap in print()
  print(summary(predictors_obs))
}

# save the relevant output for the prediction and the validation
saveRDS(predictors_obs, file = file.path(outdir, paste0("predictors_obs_",
                                                          name_suffix,
                                                          Sys.Date(), ".rds")))


# hold out the excluded year: those rows are kept aside for cross-validation
# and removed from the data the models are fitted on
if (!is.na(exclude_year)) {
  predictors_excluded_year <- predictors_obs[predictors_obs$unscaled_year == exclude_year, ]
  predictors_obs <- predictors_obs[predictors_obs$unscaled_year != exclude_year, ]
}

if (is_verbose) print(paste("3. start making all_model_terms", Sys.time()))

# Enumerate every candidate model over the abundance covariates, with no
# interactions and a quadratic term allowed for rain_dry
# (built.all.models comes from rogers_model_functions.R, sourced above).
abundance_covariates <- c("year",
                          "temp_mean",
                          "rain_var",
                          "rain_dry",
                          "dom_T_OC",
                          "peatswamp",
                          "lowland_forest",
                          "lower_montane_forest",
                          "deforestation_hansen",
                          "human_pop_dens",
                          "ou_killing_prediction",
                          "perc_muslim")

all_model_terms <- built.all.models(env.cov.names = abundance_covariates,
                                    env.cov.int = list(),
                                    env.cov.2 = c("rain_dry"))

print(paste("4. end making all model terms", Sys.time()))


# all candidate model terms: intercept, the twelve covariates and rain_dry^2
m_terms <- c("1", "year", "temp_mean", "rain_var", "rain_dry", "dom_T_OC",
             "peatswamp", "lowland_forest", "lower_montane_forest",
             "deforestation_hansen", "human_pop_dens",
             "ou_killing_prediction", "perc_muslim", "I(rain_dry^2)")

# fit the full negative-binomial model once, only to recover the coefficient
# names in the exact form glm.nb uses (these label the result columns below)
full_formula <- as.formula(paste0("nr_nests~",
                                  paste(m_terms, collapse = "+"),
                                  "+ offset(offset_term)"))
model_terms <- names(glm.nb(full_formula,
                            data = predictors_obs,
                            control = glm.control(maxit = 250))$coefficients)

# design matrix of prediction estimates: intercept, main effects, rain_dry^2
intercept <- rep(1, nrow(predictors_obs))
predictor_estimates <- cbind(intercept,
                             predictors_obs[ , predictor_names],
                             predictors_obs[ , "rain_dry"] *
                               predictors_obs[ , "rain_dry"])
names(predictor_estimates) <- c("intercept", predictor_names, "I(rain_dry^2)")



# stability of the full model via dfbeta (case-deletion influence), if desired
if (do_stability) {
  if (is_verbose) print(paste("Start stability calculation", Sys.time()))

  # the last row of all_model_terms flags every term -> the full model
  full_model <- paste(m_terms[all_model_terms[nrow(all_model_terms), ] == 1],
                      collapse = "+")
  if (is_verbose) print(paste("This is the full-model", full_model))

  model <- as.formula(paste("nr_nests ~", full_model, "+ offset(offset_term)"))
  res_full <- glm.nb(model, data = predictors_obs,
                     control = glm.control(maxit = 250))

  # per coefficient: fitted slope plus the min/max it would take when
  # single observations are dropped (dfbeta gives the change per case)
  coef_shift_range <- t(apply(X = dfbeta(res_full), MARGIN = 2, FUN = range))
  dfbeta_frame <- data.frame(slope = res_full$coefficients,
                             res_full$coefficients + coef_shift_range)
  names(dfbeta_frame) <- c("slope", "min", "max")

  write.csv(dfbeta_frame,
            file.path(outdir, paste0("glm_abundance_stability_",
                                     Sys.Date(), ".csv")),
            row.names = TRUE)
}

# #run models
if (is_verbose) print(paste("8. Start running models", Sys.time()))

# One-row template data frame that each parallel model fit fills in:
# the model label, then coefficient / p-value / standard-error columns for
# every possible term, plus theta, its SE and the AIC. When a year is held
# out, an extra R2_cross column stores the cross-validation fit.
result_names <- c("model",
                  paste("coeff", model_terms, sep = "_"),
                  paste("P", model_terms, sep = "_"),
                  paste("SE", model_terms, sep = "_"),
                  "theta", "SE.theta", "AIC")
if (!is.na(exclude_year)) {
  result_names <- c(result_names, "R2_cross")
}

result <- as.data.frame(matrix(NA, ncol = length(result_names), nrow = 1))
names(result) <- result_names


# Fit every candidate model in parallel on the fork cluster. Each iteration
# works on a worker-local copy of the one-row `result` template, fills it in
# and returns it; .combine = rbind stacks the rows into one data frame
# (one row per candidate model).
results_res <- foreach(i = 1:nrow(all_model_terms),
                       .combine = rbind) %dopar%{
    # assemble the formula from the terms flagged 1 in row i of all_model_terms
    model <- as.formula(
            paste("nr_nests ~",
	    paste(m_terms[all_model_terms[i, ] == 1], collapse = "+"),
            "+ offset(offset_term)"))
            res <- glm.nb(model, data = predictors_obs,
                          control = glm.control(maxit = 250))


# model label: the "+"-joined term string identifies this candidate model
    result[ , "model"] <- paste(m_terms[all_model_terms[i, ] == 1], collapse = "+")

    # coefficients, written into the columns matching this model's terms;
    # columns for terms not in this model stay NA
    model_coefficients <- as.vector(res$coefficients)
    result[ , paste0("coeff_", names(res$coefficients))] <- model_coefficients
    # p value, name-indexed so order matches the coefficient names
    result[ , paste0("P_", names(res$coefficients))] <-
      summary(res)$coefficients[ , "Pr(>|z|)"][names(
        res$coefficients)] #w/o parameter for autocor.

    # SE
    result[ , paste0("SE_", names(res$coefficients))] <-
      summary(res)$coefficients[ , "Std. Error"][names(
        res$coefficients)]#add line for SE

    # dispersion parameter of the negative binomial and its standard error
   result[ , "theta"] <- summary(res)$theta
   result[ , "SE.theta"] <- summary(res)$SE.theta

    # aic in last column,
    result[ , "AIC"] <- extractAIC(res)[2]

    # what do I need to do
    # I need to get only the prediction estimates columns that are true in
    # all_model_terms[i, ]==1
    # predictors_obs_pred <- predictors_obs
    # predictors_obs_pred$offset_term <- 0
    # Comparison of observed data vs prediction
    #  prediction_per_transect <-  predict.glm(res,
    #                                        newdata = predictors_obs_pred,
    #                                        type = "response")

   # comparison_lm = lm(log(predictors_obs$nr_ou_per_km2 + 1) ~
   #                    log(prediction_per_transect + 1) )

#    result[ , "R2"] <- summary(comparison_lm)$r.squared
    # if we are excluding years, this is the test of predicted data vs observed
    # data for the held-out year (with which the model wasn't fitted):
    # R2 of log observed density ~ log predicted density on that year
    # NOTE(review): relies on predictors_excluded_year having an
    # nr_ou_per_km2 column -- not visible in this script, verify upstream
    if (!is.na(exclude_year)){
  prediction_transect_excluded_year <-  predict.glm(res,
                               newdata = predictors_excluded_year,
			       type = "response")
  cross_lm = lm(log(predictors_excluded_year$nr_ou_per_km2 + 1) ~
                     log(prediction_transect_excluded_year + 1))

  result[ , "R2_cross"] <- summary(cross_lm)$r.squared
    }

  return(result)
}


# AIC confidence set across all fitted models (conf.set from get_conf_set.r),
# with the model label prepended as the first column
c_set <- cbind(as.character(results_res$model), conf.set(aic = results_res$AIC) )
names(c_set) <- c("model", names(c_set)[-1])

# for the export: attach the Akaike weights to the full results table
abundMod_results <- results_res %>%
  mutate(w_aic = c_set$w.aic)


# model-averaged coefficient summary (project helper from sourced functions)
summary_mean_coefficients <- calculate.mean.coefficients(m_terms, results_res, c_set)

# make table for model output: drop the label, rename the (presumably first)
# aic column to "AIC" so it can serve as the join key against results_res
c_set$model <- NULL
names(c_set)[1] <- "AIC"
names(c_set)  # auto-printed when run via Rscript; shows remaining columns
c_set$d.aic <- NULL
c_set$w.aic <- NULL
# keep only models in the confidence set, best (lowest AIC) first
results_out <- right_join(results_res, c_set,  by="AIC")
results_out <- results_out[order(results_out$AIC), ]

# build an output path of the form <outdir>/<stem><name_suffix><date><ext>
# (every artefact below is tagged with the held-out year and today's date)
dated_path <- function(stem, ext) {
  file.path(outdir, paste0(stem, name_suffix, Sys.Date(), ext))
}

# save the relevant output for the prediction and the validation
saveRDS(abundMod_results, file = dated_path("abundMod_results_", ".rds"))

# these are the terms that go into the model (need to guarantee same things in validation)
saveRDS(m_terms, file = dated_path("m_terms_", ".rds"))

# save the model results for interpretation
write.csv(results_out, file = dated_path("abundMod_results_", ".csv"))

# save the mean coefficients for interpretation
write.csv(summary_mean_coefficients,
          file = dated_path("abundMod_mean_coefficients_", ".csv"))

# keep the whole workspace for later inspection
save.image(dated_path("abundance_model_fitting_", ".RData"))

# fix: shut down the fork cluster started at the top of the script --
# it was never stopped, leaving orphaned worker processes behind
stopCluster(cl)

print(paste("Finished model_fitting script, at", Sys.time()))