Skip to content
Snippets Groups Projects
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
data_prep.R 5.28 KiB
# Prepare variables that we want to use ----

# Give the raw survey item codes readable names.
database_full <- database_full %>%
  rename(
    Gender = "Q03W123",
    Education = "Q06W123",
    HHSize = "Q41W123",
    WorkingTime = "Q44W123",
    Birthyear = "Q01W123",
    Rent_net = "Q07W123",
    Number_Kids = "Q42W123",
    Employment_type = "Q43W123",
    Conseq_UGS = "Q28W3",
    Conseq_Money = "Q29W3"
  )

# Translate the "A<k>" answer codes into numbers.
# Conseq_UGS and Conseq_Money are reverse-coded (A1 -> 5, ..., A5 -> 1) and
# their sixth answer option (A6) is set to missing.
database_full <- database_full %>%
  mutate(
    Gender = dplyr::recode(Gender, "A1" = 1, "A2" = 2, "A3" = 3),
    Education = dplyr::recode(
      Education,
      "A1" = 1, "A2" = 2, "A3" = 3, "A4" = 4, "A5" = 5
    ),
    Employment_type = dplyr::recode(
      Employment_type,
      "A1" = 1, "A2" = 2, "A3" = 3, "A4" = 4, "A5" = 5, "A6" = 6
    ),
    Conseq_UGS = dplyr::recode(
      Conseq_UGS,
      "A1" = 5, "A2" = 4, "A3" = 3, "A4" = 2, "A5" = 1, "A6" = NA_real_
    ),
    Conseq_Money = dplyr::recode(
      Conseq_Money,
      "A1" = 5, "A2" = 4, "A3" = 3, "A4" = 2, "A5" = 1, "A6" = NA_real_
    )
  )

# Derived covariates.
# NOTE(review): every dummy below falls through to 0 via `TRUE ~ 0`, so a
# missing answer is coded 0 rather than NA -- confirm this is intended.
database_full <- database_full %>%
  mutate(
    Gender_female = case_when(Gender == 2 ~ 1, TRUE ~ 0),
    # Age relative to the hard-coded reference year 2023.
    Age = 2023 - Birthyear,
    Uni_degree = case_when(Education == 5 ~ 1, TRUE ~ 0),
    Kids_Dummy = case_when(Number_Kids > 0 ~ 1, TRUE ~ 0),
    Employment_full = case_when(Employment_type == 1 ~ 1, TRUE ~ 0),
    Pensioner = case_when(Employment_type == 6 ~ 1, TRUE ~ 0),
    # Mean-centred age. na.rm = TRUE keeps a single missing birth year from
    # turning Age_mean into NA for every respondent.
    # NOTE(review): the mean is taken over the rows present at this point;
    # any filtering done afterwards shifts the sample mean away from 0.
    Age_mean = Age - mean(Age, na.rm = TRUE)
  )

# Data cleaning ----

# Keep rows with 50 <= Rent_SQ <= 10000 and 0 < WalkingDistance_SQ <= 300,
# and drop Gender == 3. filter() also drops rows where any of these
# conditions evaluates to NA.
database_full <- database_full %>%
  filter(
    Rent_SQ >= 50, Rent_SQ <= 10000,
    WalkingDistance_SQ > 0, WalkingDistance_SQ <= 300,
    Gender != 3
  )

# Same rent and walking-distance bounds for `database` (no gender filter).
database <- database %>%
  filter(
    Rent_SQ >= 50, Rent_SQ <= 10000,
    WalkingDistance_SQ > 0, WalkingDistance_SQ <= 300
  )

# Inspect the distribution of interview durations before applying the cut-off.
summary(database_full$interviewtime)

# Drop interviews with interviewtime below 300.
# TODO: decide whether this threshold should be changed (original note asked
# about changing the time "to 10 seconds").
database_full <- database_full %>%
  filter(interviewtime >= 300)


# Keep only rows with a known Treatment_new and attach three labellings of
# the treatment assignment:
#   Treatment_A - label derived from `Treatment` (codes 1-3)
#   Treatment_B - Treatment_new collapsed to Treated / Not_Treated
#   Treatment_C - one label per Treatment_new code (1-6)
# Codes outside the listed values yield NA in all three columns.
database_full <- database_full %>%
  filter(!is.na(Treatment_new)) %>%
  mutate(
    Treatment_A = case_when(
      Treatment == 1 ~ "Treated",
      Treatment == 2 ~ "Vol_Treated",
      Treatment == 3 ~ "Not_Treated",
      TRUE ~ NA_character_
    ),
    Treatment_B = case_when(
      Treatment_new %in% c(1, 2, 4, 5) ~ "Treated",
      Treatment_new %in% c(3, 6) ~ "Not_Treated"
    ),
    Treatment_C = case_when(
      Treatment_new == 1 ~ "Video 1",
      Treatment_new == 2 ~ "No Video 1",
      Treatment_new == 3 ~ "No Info 2",
      Treatment_new == 4 ~ "No Video 2",
      Treatment_new == 5 ~ "Video 2",
      Treatment_new == 6 ~ "No Treatment 3",
      TRUE ~ NA_character_
    )
  )

# Ids that survived all filters applied to database_full.
id_list <- unique(database_full$id)

# Do we still want to use this, or only database_full?
# Restrict `database` to the surviving ids with a known Treatment_new.
database <- database %>%
  filter(id %in% id_list, !is.na(Treatment_new))
# Building NR Index ----

# The index is built from the 21 items Q38S01W3 ... Q38S21W3. The items are
# named explicitly (instead of matching every column that starts with "Q38")
# so that an unrelated Q38* column can never enter the row sum, and so the
# divisor used for the mean always equals the number of summed items.
q38_variables <- sprintf("Q38S%02dW3", 1:21)

# Convert the answer codes A1..A5 to the numbers 1..5; any other value
# becomes NA. The table is printed before and after as a visual check.
for (i in seq_along(q38_variables)) {
  variable_name <- q38_variables[i]
  cat("Table for", variable_name, ":\n")
  print(table(database_full[[variable_name]]))
  cat("\n")
  database_full[[variable_name]] <- as.numeric(
    factor(database_full[[variable_name]], levels = c("A1", "A2", "A3", "A4", "A5"))
  )
  cat("Table for", variable_name, ":\n")
  print(table(database_full[[variable_name]]))
  cat("\n")
}

# Items that are reverse-scored.
variables_to_reverse <- c(
  "Q38S02W3", "Q38S03W3", "Q38S10W3", "Q38S11W3",
  "Q38S13W3", "Q38S14W3", "Q38S15W3", "Q38S18W3"
)
for (variable_name in variables_to_reverse) {
  cat("Table for", variable_name, ":\n")

  # The item is already numeric (1..5) at this point; flip the scale so
  # that 1 <-> 5 and 2 <-> 4.
  database_full[[variable_name]] <- 6 - database_full[[variable_name]]

  # Print the table
  print(table(database_full[[variable_name]]))
  cat("\n")
}

# Row sum over the 21 items; a respondent with any missing item gets NA.
database_full$Total_NR <- rowSums(database_full[q38_variables])
hist(database_full$Total_NR)

# Mean item score and its z-standardised version.
database_full <- database_full %>%
  mutate(Mean_NR = Total_NR / length(q38_variables))
mean_nr <- mean(database_full$Mean_NR, na.rm = TRUE)
sd_nr <- sd(database_full$Mean_NR, na.rm = TRUE)
database_full <- database_full %>%
  mutate(Z_Mean_NR = (Mean_NR - mean_nr) / sd_nr)

# Copy the score into `database` by respondent id. The two data frames are
# filtered separately, so a positional copy is only correct if they happen
# to have identical rows in identical order.
database$Z_Mean_NR <- database_full$Z_Mean_NR[match(database$id, database_full$id)]
summary(database$Z_Mean_NR)

# Self-Reference Index ----

# Convert the answer codes A1..A5 of TV08W3, TV09W3 and TV10W3 to the
# numbers 1..5; any other value becomes NA. The table is printed before and
# after as a visual check.
for (i in 8:10) {
  variable_name <- sprintf("TV%02dW3", i)  # Generate variable name
  cat("Table for", variable_name, ":\n")
  print(table(database_full[[variable_name]]))
  cat("\n")
  database_full[[variable_name]] <- as.numeric(
    factor(database_full[[variable_name]], levels = c("A1", "A2", "A3", "A4", "A5"))
  )
  cat("Table for", variable_name, ":\n")
  print(table(database_full[[variable_name]]))
  cat("\n")
}

# Sum of the three items; a respondent with any missing item gets NA.
database_full$Total_SR <- database_full$TV08W3 +
  database_full$TV09W3 +
  database_full$TV10W3
hist(database_full$Total_SR)

# Mean item score and its z-standardised version.
database_full <- database_full %>%
  mutate(Mean_SR = Total_SR / 3)
mean_sr <- mean(database_full$Mean_SR, na.rm = TRUE)
sd_sr <- sd(database_full$Mean_SR, na.rm = TRUE)
database_full <- database_full %>%
  mutate(Z_Mean_SR = (Mean_SR - mean_sr) / sd_sr)

# Copy the score into `database` by respondent id. The two data frames are
# filtered separately, so a positional copy is only correct if they happen
# to have identical rows in identical order.
database$Z_Mean_SR <- database_full$Z_Mean_SR[match(database$id, database_full$id)]
summary(database$Z_Mean_SR)