Skip to content
Snippets Groups Projects
Commit fd294da8 authored by Maria Voigt's avatar Maria Voigt
Browse files

merging abundance model from laptop

parents 6b9c431b d46119c0
No related branches found
No related tags found
No related merge requests found
......@@ -102,6 +102,7 @@ source(file.path(indir_fun, "roger_functions/get_conf_set.r"))
#define offset ground
ESW <- 0.01595 #effective strip width in km
ESW_aerial <- 0.075 # effective strip width for aerial transects
NCS <- 1.12 #nest construction rate from Spehar et al. 2010
PNB <- 0.88 # proportion of nest builders from Spehar et al. 2010
......@@ -109,6 +110,11 @@ PNB <- 0.88 # proportion of nest builders from Spehar et al. 2010
options("scipen" = 100, "digits" = 4)
if (is.na(exclude_year)){
name_suffix <- ""} else {
name_suffix <- paste0(exclude_year, "_")
}
#---------------#
# Import data #
#---------------#
......@@ -126,90 +132,154 @@ predictors_path <- path.to.current(indir, "predictors_observation", "rds")
if(is_verbose){print(paste("predictors-path", predictors_path))}
predictors <- readRDS(predictors_path)
# calculate x and y center
geography$unscaled_x_center <- rowMeans(cbind(geography$x_start, geography$x_end), na.rm = T)
geography$unscaled_y_center <- rowMeans(cbind(geography$y_start, geography$y_end), na.rm = T)
#--------------------------------#
# Transform and scale predictors #
#--------------------------------#
# Transform predictors
predictors[predictors$predictor == "distance_PA", "value"] <- sqrt(
predictors[predictors$predictor == "distance_PA", "value"])
predictors[predictors$predictor == "human_pop_dens", "value"] <- log(
predictors[predictors$predictor == "human_pop_dens", "value"] + 1)
predictors[predictors$predictor == "deforestation_gaveau", "value"] <- sqrt(
predictors[predictors$predictor == "deforestation_gaveau", "value"])
predictors[predictors$predictor == "plantation_distance", "value"] <- log(
predictors[predictors$predictor == "plantation_distance", "value"] + 1)
predictors[predictors$predictor == "pulp_distance", "value"] <- log(
predictors[predictors$predictor == "pulp_distance", "value"] + 1)
predictors[predictors$predictor == "palm_distance", "value"] <- log(
predictors[predictors$predictor == "palm_distance", "value"] + 1)
# STARTING THE SCALING
# SCALE PREDICTORS
# these are the predictors that will be used in the model
predictor_names_for_scaling <- c( "dem", "slope", "temp_mean", "rain_dry", "rain_var",
"ou_killings", "ou_killing_prediction", "human_pop_dens",
"perc_muslim", "peatswamp", "lowland_forest", "lower_montane_forest" ,
"road_dens", "distance_PA", "fire_dens", "deforestation_hansen",
"deforestation_gaveau", "plantation_distance", "pulp_distance", "palm_distance",
"dom_T_OC", "dom_T_PH")
# predictors used in model
predictor_names <- c("year", "temp_mean", "rain_var", "rain_dry", "dom_T_OC",
"peatswamp", "lowland_forest",
"lower_montane_forest", "deforestation_hansen",
"human_pop_dens", "ou_killing_prediction",
"perc_muslim" )
"peatswamp", "lowland_forest",
"lower_montane_forest", "deforestation_hansen",
"human_pop_dens", "ou_killing_prediction",
"perc_muslim" )
predictors <- dplyr::select(predictors, id, predictor, unscaled_year = year,
unscaled_value = value)
# need to get rid of occurrence data
predictors <- predictors %>%
inner_join(transects, by = "id")
# exclude aerial if that is needed
if (include_aerial == F){
predictors <- filter(predictors, group != "aerial")
}
# SCALE PREDICTORD
for (predictor_name in predictor_names_for_scaling){
predictors[predictors$predictor == predictor_name,
"value" ] <-
as.numeric(as.vector(scale(predictors[predictors$predictor == predictor_name,
"unscaled_value"])))
}
predictors$year <- as.numeric(as.vector(scale(predictors[ , "unscaled_year"])))
geography <- dplyr::select(geography, -year)
# delete all rows that have zero
if(is_verbose){print("how many rows with na in scaled_value")
nrow(predictors[is.na(predictors$scaled_value), ])}
nrow(predictors[is.na(predictors$value), ])}
# deleting is.na values here
predictors <- predictors[!is.na(predictors$scaled_value), ]
predictors <- predictors[!is.na(predictors$value), ]
geography <- dplyr::select(geography, -c(year))
# Rename here1
predictors_obs <- predictors %>%
dplyr::filter(predictor %in% predictor_names) %>%
dcast(id + year ~ predictor, value.var = "scaled_value")%>%
dplyr::filter(predictor %in% predictor_names_for_scaling)%>%
dcast(id + year ~ predictor, value.var = "value") %>%
inner_join(geography, by = "id")%>%
dplyr::select(-group) %>%
inner_join(transects, by = "id" )
# work on aerial transects
# here we need to calculate first the aerial index (nests / km) and then transform it to nest-density
# then we include the offset without length_km * 2 * ESW
# for the transect this goes into the term
# we are also adding a conversion factor to have the response more similar,
# that we are going to add into the response and the offset
aerial_conversion_factor <- 100
aerial_predictors_obs <- dplyr::filter(predictors_obs, group == "aerial")
# Ai is aerial index (number of nests detected per kilometer of flight)
aerial_predictors_obs$AI <- aerial_predictors_obs$nr_nests /
aerial_predictors_obs$length_km
# calculate orangutan nest density from aerial index with formula 6 given in Ancrenaz et al., 2004
aerial_predictors_obs$nr_nests <- round(exp(4.7297 + 0.9796 *
log(aerial_predictors_obs$AI))
/ aerial_conversion_factor)
aerial_predictors_obs$ou_dens <- aerial_predictors_obs$nr_nests /
(aerial_predictors_obs$nest_decay * NCS * PNB)
aerial_predictors_obs$offset_term <- log( (1 * 1)/ aerial_conversion_factor *
aerial_predictors_obs$nest_decay *
NCS * PNB )
other_predictors_obs <- filter(predictors_obs, group != "aerial")
other_predictors_obs$ou_dens <- (other_predictors_obs$nr_nests/
(other_predictors_obs$length_km * ESW * 2)) *
(1/(other_predictors_obs$nest_decay * NCS * PNB))
other_predictors_obs$offset_term <- log(other_predictors_obs$length_km * ESW *
2 * other_predictors_obs$nest_decay *
NCS * PNB)
names_predictors_obs <- names(other_predictors_obs)
predictors_obs <- aerial_predictors_obs %>%
dplyr::select(id:length_km, nr_nests, nest_decay, ou_dens, offset_term)
if(is_verbose){ print("This has to be true:")
unique(names(predictors_obs) == names(other_predictors_obs))}
# HAS TO BE TRUE
predictors_obs <- predictors_obs %>%
bind_rows(other_predictors_obs) %>%
arrange(id) %>%
as.data.frame(.)
if (include_aerial == FALSE) {
predictors_obs <- filter(predictors_obs, group != "aerial")
}
# SCALE YEAR
scaled_year <- as.vector(scale(predictors_obs$year))
predictors_obs$unscaled_year <- predictors_obs$year
predictors_obs$year <- as.numeric(scaled_year)
predictors_obs_unscaled <- predictors %>%
dplyr::filter(predictor %in% predictor_names_for_scaling)%>%
dcast(id + unscaled_year ~ predictor, value.var = "unscaled_value")
# calculate x and y center
predictors_obs$x_center <- rowMeans(cbind(predictors_obs$x_start, predictors_obs$x_end), na.rm = T)
predictors_obs$y_center <- rowMeans(cbind(predictors_obs$y_start, predictors_obs$y_end), na.rm = T)
names(predictors_obs_unscaled)[-c(1,2)] <- paste0("unscaled_",
names(predictors_obs_unscaled)[-c(1,2)])
predictors_obs <- left_join(predictors_obs, predictors_obs_unscaled, by = "id")
# scale x and y center
predictors_obs$x_center <- as.numeric(as.vector(scale(predictors_obs[ ,
"unscaled_x_center"])))
predictors_obs$y_center <- as.numeric(as.vector(scale(predictors_obs[ ,
"unscaled_y_center"])))
# work on aerial transects
aerial_predictors_obs <- dplyr::filter(predictors_obs, group == "aerial")
# density
aerial_predictors_obs$ou_dens <- (aerial_predictors_obs$nr_nests/
(aerial_predictors_obs$length_km * ESW_aerial * 2)) *
(1/(aerial_predictors_obs$nest_decay * NCS * PNB))
# offset
aerial_predictors_obs$offset_term <- log(aerial_predictors_obs$length_km * ESW_aerial *
2 * aerial_predictors_obs$nest_decay *
NCS * PNB)
# work on ground transects
other_predictors_obs <- filter(predictors_obs, group != "aerial")
# density
other_predictors_obs$ou_dens <- (other_predictors_obs$nr_nests/
(other_predictors_obs$length_km * ESW * 2)) *
(1/(other_predictors_obs$nest_decay * NCS * PNB))
# offset
other_predictors_obs$offset_term <- log(other_predictors_obs$length_km * ESW *
2 * other_predictors_obs$nest_decay *
NCS * PNB)
names_predictors_obs <- names(other_predictors_obs)
# bind the two together
predictors_obs <- aerial_predictors_obs %>%
bind_rows(other_predictors_obs) %>%
arrange(id) %>%
dplyr::select(id, group, x_start:LU, length_km:nest_decay,
year, deforestation_gaveau:temp_mean, x_center, y_center,
unscaled_year:unscaled_temp_mean, unscaled_x_center,
unscaled_y_center, ou_dens, offset_term) %>%
as.data.frame(.)
if(is_verbose){print("look at predictors_obs")
str(predictors_obs)
summary(predictors_obs)}
str(predictors_obs)
summary(predictors_obs)}
# save this
# save the relevant output for the prediction and the validation
saveRDS(predictors_obs, file = file.path(outdir, paste0("predictors_obs_",
name_suffix,
Sys.Date(), ".rds")))
# now exclude the year that needs to be excluded
if (!is.na(exclude_year)){
......@@ -263,6 +333,8 @@ model_terms <- names(glm.nb(as.formula(paste("nr_nests~", paste(m_terms,
sep = "")),
data = predictors_obs,
control = glm.control(maxit = 250))$coefficients)
# prediction estimates
intercept <- rep(1, nrow(predictors_obs))
predictor_estimates <- cbind( intercept,
......@@ -286,8 +358,10 @@ model <- as.formula(
paste("nr_nests ~", full_model, "+ offset(offset_term)"))
res_full <- glm.nb(model, data = predictors_obs,
control = glm.control(maxit = 250))
# HERE I CAN NOW USE THE OTHER FUNCTION
dfbeta_frame <- data.frame(slope=res_full$coefficients, res_full$coefficients+
t(apply(X=dfbeta(res_full),
......@@ -406,11 +480,6 @@ c_set$w.aic <- NULL
results_out <- right_join(results_res, c_set, by="AIC")
results_out <- results_out[order(results_out$AIC), ]
if (is.na(exclude_year)){
name_suffix <- ""} else {
name_suffix <- paste0(exclude_year, "_")
}
# save the relevant output for the prediction and the validation
saveRDS(abundMod_results, file = file.path(outdir, paste0("abundMod_results_",
name_suffix,
......
......@@ -29,7 +29,7 @@ suppressMessages(registerDoParallel(cl))
#-----------------------------#
print(paste("Start abundance_prediction script", Sys.time()))
option_list <- list (
make_option(c("-i", "--input-directory"), dest = "input_directory",
type = "character", help = "directory with input files",
......@@ -119,11 +119,15 @@ if(is_verbose){print(paste(Sys.time(), "0. start run"))}
# command line arguments #
#------------------------#
indir_fun <- "~/orangutan_density_distribution/src/functions/"
crs_aea <- "+proj=aea +lat_1=7 +lat_2=-32 +lat_0=-15 +lon_0=125 +x_0=0
+y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs" # final projection
if (is.na(exclude_year)){
name_suffix <- ""} else {
name_suffix <- paste0(exclude_year, "_")
}
#-------------------------------#
# Load and prepare coefficients #
......@@ -167,19 +171,93 @@ predictor_names <- predictor_names_coeffs[!grepl("I(*)", predictor_names_coeffs)
#----------------------------#
# Load and prepare estimates #
#----------------------------#
if(is_verbose){print(paste("these are predictor names: ", predictor_names))}
geography_path <- path.to.current(indir, paste0("geography_",
year_to_predict),"rds")
if(is_verbose){print(paste("this is geography path", geography_path))}
geography <- readRDS(geography_path)
predictors_path <- path.to.current(indir_predictors, paste0("predictors_abundance_",
year_to_predict),"rds")
if(is_verbose){print(paste("this is predictors path", predictors_path))}
predictors <- readRDS(predictors_path) %>%
dplyr::filter(predictor %in% predictor_names) %>%
dcast(id + z_year ~ predictor, value.var = "scaled_value")
predictors <- readRDS(predictors_path)
predictors_obs_path <- path.to.current(indir_predictors,
paste0("predictors_obs_", name_suffix),
"rds")
if(is_verbose){print(paste("this is predictors-obs path", predictors_obs_path))}
predictors_obs <- readRDS(predictors_obs_path)
# Scale the grid-predictors using mean and sd of predictors_obs
# predictors for scaling
predictor_names_for_scaling <- c( "dem", "slope", "temp_mean", "rain_dry", "rain_var",
"ou_killings", "ou_killing_prediction", "human_pop_dens",
"perc_muslim", "peatswamp", "lowland_forest", "lower_montane_forest" ,
"road_dens", "distance_PA", "fire_dens", "deforestation_hansen",
"deforestation_gaveau", "plantation_distance", "pulp_distance", "palm_distance",
"dom_T_OC", "dom_T_PH")
# predictors used in model
predictor_names <- c("year", "temp_mean", "rain_var", "rain_dry", "dom_T_OC",
"peatswamp", "lowland_forest",
"lower_montane_forest", "deforestation_hansen",
"human_pop_dens", "ou_killing_prediction",
"perc_muslim" )
predictors <- rename(predictors, unscaled_value = value)
# calculate x and y center
geography$unscaled_x_center <- rowMeans(cbind(geography$x_start, geography$x_end), na.rm = T)
geography$unscaled_y_center <- rowMeans(cbind(geography$y_start, geography$y_end), na.rm = T)
predictors$year <- predictors$z_year
predictors$z_year <- NULL
if(is_verbose){str(predictors)}
for (predictor_name in predictor_names_for_scaling){
mean_predictor_obs <- mean(predictors_obs[ , paste0("unscaled_", predictor_name)], na.rm = T)
sd_predictor_obs <- mean(predictors_obs[ , paste0("unscaled_", predictor_name)], na.rm = T)
predictors[predictors$predictor == predictor_name,
"value" ] <- (predictors[predictors$predictor == predictor_name,
"unscaled_value" ] - mean_predictor_obs) /
sd_predictor_obs
}
# cast it to wide
predictors_grid <- dplyr::filter(predictors, predictor %in% predictor_names_for_scaling) %>%
dcast(id + year ~ predictor, value.var = "value") %>%
dplyr::select(-year)
predictors_grid_unscaled <- dplyr::filter(predictors, predictor %in% predictor_names_for_scaling ) %>%
dcast(id + year ~ predictor, value.var = "unscaled_value") %>%
rename(unscaled_year = year)
names(predictors_grid_unscaled)[-c(1,2)] <- paste0("unscaled_", names(predictors_grid_unscaled)[-c(1, 2)])
# join with geography to have x and y-center
predictors_grid <- predictors_grid %>%
left_join(predictors_grid_unscaled, by = "id") %>%
left_join(geography, by = "id")
if(is_verbose){print(paste("this is nrow predictors", nrow(predictors)))}
# year and x- and y-center
additional_predictors <- c("year", "x_center", "y_center")
for (predictor_name in additional_predictors){
mean_predictor_obs <- mean(predictors_obs[ , paste0("unscaled_", predictor_name)], na.rm = T)
sd_predictor_obs <- mean(predictors_obs[ , paste0("unscaled_", predictor_name)], na.rm = T)
predictors_grid[ ,
predictor_name] <- (predictors_grid[ ,
paste0("unscaled_", predictor_name) ] - mean_predictor_obs) /
sd_predictor_obs
}
# RENAME PREDICTORS INTO PREDICTORS_GRID
#--------------------------#
# PREDICTION FOR each year #
......@@ -188,10 +266,10 @@ if(is_verbose){print(paste("this is nrow predictors", nrow(predictors)))}
# in the coefficients
# HERE PAY ATTENTION FOR INTERACTIONS
# AND IF THERE IS MORE THAN ONE QUADRATIC TERM
intercept <- rep(1, nrow(predictors))
intercept <- rep(1, nrow(predictors_grid))
predictor_estimates <- cbind( intercept,
predictors[ , predictor_names],
predictors[ ,quadratic_terms_names] * predictors[ ,quadratic_terms_names])
predictors_grid[ , predictor_names],
predictors_grid[ ,quadratic_terms_names] * predictors_grid[ ,quadratic_terms_names])
names(predictor_estimates) <- c("intercept", predictor_names,
paste0("I(", quadratic_terms_names, "^2)"))
......@@ -205,19 +283,19 @@ names(predictor_estimates) <- c("intercept", predictor_names,
if(is_verbose){print(paste("1. start pred_per_cell", Sys.time()))}
pred_per_cell <- foreach(i = 1:nrow(predictor_estimates), .combine = c) %dopar% {
# pred_per_cell <- foreach(i = 1:100, .combine = c) %dopar% {
t_predictor_estimates <- t( predictor_estimates[i, ])
pred_estimates_wcoeffs <- data.frame(mapply(`*`, coeffs, t_predictor_estimates, SIMPLIFY = F))
pred_estimates_sum <- apply(pred_estimates_wcoeffs, 1, sum)
pred_estimates_weighted <- pred_estimates_sum * abundMod_results$w_aic
pred_estimates_calc <- sum(pred_estimates_weighted)
return(exp(pred_estimates_calc))
# pred_per_cell <- foreach(i = 1:100, .combine = c) %dopar% {
t_predictor_estimates <- t( predictor_estimates[i, ])
pred_estimates_wcoeffs <- data.frame(mapply(`*`, coeffs, t_predictor_estimates, SIMPLIFY = F))
pred_estimates_sum <- apply(pred_estimates_wcoeffs, 1, sum)
pred_estimates_weighted <- pred_estimates_sum * abundMod_results$w_aic
pred_estimates_calc <- sum(pred_estimates_weighted)
return(exp(pred_estimates_calc))
}
if(is_verbose){print(paste(Sys.time(), "2. finished dopar loop"))}
# is this correct -> ????
pred_per_cell <- as.data.frame(cbind(predictors$id, pred_per_cell))
pred_per_cell <- as.data.frame(cbind(predictors_grid$id, pred_per_cell))
names(pred_per_cell) <- c("id", "abundance_pred")
# exclude NAN
......@@ -264,15 +342,15 @@ geography_grid <- readRDS(geography_grid_path)
geography_grid_for_join <- dplyr::select(geography_grid, id, x_start, y_start)
pred_per_cell_sp <- left_join(geography_grid_for_join,
pred_per_cell, by = "id") %>%
as.data.frame()
pred_per_cell, by = "id") %>%
as.data.frame()
pred_per_cell_sp <- SpatialPointsDataFrame(coords =
cbind(pred_per_cell_sp$x_start,
pred_per_cell_sp$y_start),
data = pred_per_cell_sp,
proj4string = CRS(crs_aea), match.ID = T)
cbind(pred_per_cell_sp$x_start,
pred_per_cell_sp$y_start),
data = pred_per_cell_sp,
proj4string = CRS(crs_aea), match.ID = T)
writeOGR(pred_per_cell_sp, dsn = outdir,
layer = paste0("prediction_shp_",
......@@ -305,3 +383,5 @@ save.image(file.path(outdir, paste0("abundance_pred_image_", name_suffix,
print(paste("Finish model_prediction script for year", year_to_predict,
Sys.time()))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment