From 98d3c134b1e2569f5691b9029a7772727708a46c Mon Sep 17 00:00:00 2001 From: Nino Cavallaro <nino.cavallaro@idiv.de> Date: Fri, 21 Mar 2025 12:10:52 +0100 Subject: [PATCH] lapage test etc. --- Scripts/create_tables.R | 58 +++- Scripts/data_prep.R | 272 ++++++++--------- Scripts/descriptives.R | 439 +++++++++++++++++++++++++++ Scripts/lapage_results.R | 210 +++++++++++++ Scripts/lapage_results_naturalness.R | 372 +++++++++++++++++++++++ Scripts/logit/chr_vol_treat.R | 372 ++++++++++++----------- Scripts/ols/ols_consequentiality.R | 5 + Scripts/ols/ols_time_spent.R | 4 +- Scripts/visualize_distr_spli.R | 231 +++++++++----- 9 files changed, 1565 insertions(+), 398 deletions(-) create mode 100644 Scripts/descriptives.R create mode 100644 Scripts/lapage_results.R create mode 100644 Scripts/lapage_results_naturalness.R diff --git a/Scripts/create_tables.R b/Scripts/create_tables.R index a56556e..8f0c0ba 100644 --- a/Scripts/create_tables.R +++ b/Scripts/create_tables.R @@ -11,6 +11,9 @@ list_ols <- list("(Intercept)" = "Intercept", "as.factor(Treatment_A)Treated" = "as.factor(Treatment_C)No Video 2" = "Text 2", "as.factor(Treatment_C)Video 1" = "Video 1", "as.factor(Treatment_C)Video 2" = "Video 2", "as.factor(Treatment_D)Treated" = "Treated", "as.factor(Treatment_D)Vol. Treated" = "Vol. Treated", "as.factor(Treatment_D)No Info 2" = "No Info", + "as.factor(Matching_Group)Control Pred" = "Control Predicted", "as.factor(Matching_Group)Opt Not Pred" = "Opt. Treatment Not Predicted", + "as.factor(Matching_Group)Opt Pred" = "Opt. Treatment Predicted", "as.factor(Matching_Group)Treat Not Pred" = "Treated Not Predicted", + "as.factor(Matching_Group)Treat Pred" = "Treated Predicted", "Z_Mean_NR" = "NR-Index", "as.factor(Gender)2" = "Female", "Age_mean" = "Age", "QFIncome" = "Income", "Uni_degree" = "University Degree") # OLS A @@ -49,6 +52,25 @@ texreg(l=list(ols_time_spent_control_D, ols_time_cc_control_D, ols_percentage_co caption = "Results of OLS regressions for Scenario Case B.", file="Tables/ols/ols_D.tex") +# OLS G +texreg(l=list(ols_time_spent_control_G, ols_time_cc_control_G, ols_percentage_correct_control_G, conseq_model_control_G), + custom.model.names = c("Interview Time", "CC Time", "Quiz", "Cons. Score"), + custom.header = list("Model 1B" = 1:1, "Model 2B" = 2:2, "Model 3B" = 3:3, "Model 4B" = 4:4), + custom.coef.map = list_ols, stars = c(0.01, 0.05, 0.1), float.pos="tb", + custom.note = "Notes: (i) The dependent variables examined in this regression analysis are as follows: + Model 1B refers to net interview time, + Model 2B denotes choice card time, Model 1B represents the percentage of correct quiz questions, and Model 4B represents consequentiality score. + (ii) The variables included in the analysis are as follows: Treated is a dummy variable indicating membership + in the obligatory treated group, Vol. Treated is a dummy variable indicating the group that voluntarily chose the optional treatment, + while No Info indicates the group that did not opt for the treatment, with the reference group being Non-Treated. NR-Index represents the z-standardized + natural relatedness index. Female is a dummy variable denoting gender, Age has been mean-centered and measured in years, + Income is a continuous variable indicating a transition from one income group to the next higher, and University Degree is + a dummy variable indicating whether an individual holds a university degree; (iii) %stars and standard errors in parentheses.", + label = "tab:olsD", + caption = "Results of OLS regressions for Scenario Case B.", + file="Tables/ols/ols_G.tex") + + # Manipulation check texreg(l=list(ols_percentage_correct_A, ols_percentage_correct_control_A, ols_percentage_correct_C, ols_percentage_correct_control_C), custom.model.names = c("Case A", "with Controls", "Case B", "with Controls"), @@ -60,6 +82,7 @@ texreg(l=list(ols_percentage_correct_A, ols_percentage_correct_control_A, ols_p file="Tables/ols/manipulation.tex") + # Net interview time texreg(l=list(ols_time_spent_A, ols_time_spent_control_A, ols_time_spent_C, ols_time_spent_control_C), custom.model.names = c("Case A", "with Controls", "Case B", "with Controls"), @@ -241,7 +264,7 @@ texreg(c(case_B_cols_NR[1], remGOF(case_B_cols_NR[2:6])), caption = "Results of mixed logit model with treatment interactions for Case B including NR-index.", file="Tables/mxl/case_B_rent_INT_NR.tex") - +#### Case A split samples ##### mxl_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Treated A") mxl_vol_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Vol_Treated A") mxl_not_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Not_Treated A") @@ -257,12 +280,41 @@ texreg(c(not_tr, treated, opt_tr), "ASC_sq_info" = "ASC SQ", "rent_info" = "Rent", "nat_info" = "Naturalness", "walking_info" = "Walking Distance", "sig_natural" = "Naturalness SD", "sig_walking" = "Walking Distance SD", "sig_rent" = "Rent SD", "sig_ASC_1" = "ASC 1 SD", "sig_ASC_2" = "ASC 2 SD"), - custom.model.names = c("Not Treated", "Treated", "Optional Treatment"), custom.note = "%stars (one-sided). Robust standard errors in parentheses.", + custom.model.names = c("Control", "Treated", "Optional Treatment"), custom.note = "%stars (one-sided). Robust standard errors in parentheses.", stars = c(0.01, 0.05, 0.1), float.pos="tb", - label = "tab:mxl_C_NR", + label = "tab:mxl_A_split", caption = "Results of mixed logit models for case A.", file="Tables/mxl/case_A_splits.tex") + +#### Matching split samples ##### +MXL_wtp_Control_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Not_Pred") +MXL_wtp_Control_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Pred") +MXL_wtp_Opt_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Not_Pred") +MXL_wtp_Opt_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Pred") +MXL_wtp_Treated_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Not_Pred") +MXL_wtp_Treated_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Pred") + +control_not_pred <- quicktexregapollo(MXL_wtp_Control_Not_Pred_model) +control_pred <- quicktexregapollo(MXL_wtp_Control_Pred_model) +treat_not_pred <- quicktexregapollo(MXL_wtp_Treated_Not_Pred_model) +treat_pred <- quicktexregapollo(MXL_wtp_Treated_Pred_model) +opt_not_pred <- quicktexregapollo(MXL_wtp_Opt_Not_Pred_model) +opt_pred <- quicktexregapollo(MXL_wtp_Opt_Pred_model) + +texreg(c(control_not_pred, control_pred, treat_not_pred, treat_pred, opt_not_pred, opt_pred), + custom.coef.map = list("mu_natural" = "Naturalness", "mu_walking" = "Walking Distance", "mu_rent" = "Rent", + "ASC_1" = "ASC 1","ASC_2" = "ASC 2", "_natural" = "Naturalness", "nat" = "Naturalness", + "wd" = "Walking Distance", "asc" = "ASC SQ", "ASC" ="ASC SQ", + "ASC_sq_info" = "ASC SQ", "rent_info" = "Rent", "nat_info" = "Naturalness", "walking_info" = "Walking Distance", + "sig_natural" = "Naturalness SD", "sig_walking" = "Walking Distance SD", "sig_rent" = "Rent SD", + "sig_ASC_1" = "ASC 1 SD", "sig_ASC_2" = "ASC 2 SD"), + custom.model.names = c("Control Not Predicted", "Control Predicted", "Treated Not Predicted", "Treated Predicted", "Opt. Treatment Not Predicted", "Opt. Treatment Predicted"), custom.note = "%stars (one-sided). Robust standard errors in parentheses.", + stars = c(0.01, 0.05, 0.1), float.pos="tb", + label = "tab:mxl_B_Match", + caption = "Results of mixed logit models for case B.", + file="Tables/mxl/case_B_splits.tex") + # Main model # texreg(l=list(mxl_wtp_case_a_rentINT), # custom.coef.map = list("mu_natural" = "Naturalness", "mu_walking" = "Walking Distance", "mu_rent" = "Rent", diff --git a/Scripts/data_prep.R b/Scripts/data_prep.R index 9094278..8c88b1f 100644 --- a/Scripts/data_prep.R +++ b/Scripts/data_prep.R @@ -1,136 +1,136 @@ -# Prepare variables that we want to use - -database_full <- database_full %>% rename(Gender = "Q03W123", Education = "Q06W123", HHSize = "Q41W123", - WorkingTime = "Q44W123", Birthyear = "Q01W123", Rent_net = "Q07W123", - Number_Kids = "Q42W123", Employment_type = "Q43W123", Conseq_UGS = "Q28W3", - Conseq_Money = "Q29W3", UGS_visits = "Q23W123") - - -database_full <- database_full %>% mutate(Gender = dplyr::recode(Gender, "A1" = 1, "A2" = 2, "A3"=3), - Education = dplyr::recode(Education, "A1" = 1, "A2" = 2, "A3"=3, "A4" = 4, "A5" = 5), - Employment_type = dplyr::recode(Employment_type, "A1" = 1, "A2" = 2, "A3"=3, "A4" = 4, - "A5" = 5, "A6" = 6), - Conseq_UGS = dplyr::recode(Conseq_UGS, "A1" = 5, "A2" = 4, "A3"=3, "A4" = 2, "A5" = 1, "A6" = NA_real_), - Conseq_Money = dplyr::recode(Conseq_Money, "A1" = 5, "A2" = 4, "A3"=3, "A4" = 2, "A5" = 1, "A6" = NA_real_), - UGS_visits = dplyr::recode(UGS_visits, "A1" = 1, "A2" = 2, "A3"=3, "A4" = 4, "A5" = 5, "A6" = 6, - "A7" = 7, "A8" = 8, "A9" = 9, "A10"= 10)) - -database_full <- database_full %>% mutate(Gender_female = case_when(Gender == 2 ~1, TRUE~0), - Age = 2023-Birthyear, - Uni_degree = case_when(Education == 5 ~1, TRUE~0), - Kids_Dummy = case_when(Number_Kids > 0 ~ 1, TRUE ~0), - Employment_full = case_when(Employment_type == 1 ~ 1, TRUE~0), - Pensioner = case_when(Employment_type == 6 ~ 1, TRUE~0), - Age_mean = Age - mean(Age), - Income_mean = QFIncome - mean(QFIncome)) - -# Data cleaning - - - -database_full <- database_full %>% filter(Rent_SQ <= 10000 & Rent_SQ >=50) %>% - filter(WalkingDistance_SQ > 0 & WalkingDistance_SQ <= 300) %>% - filter(Gender!=3) - -database <- database %>% filter(Rent_SQ <= 10000 & Rent_SQ >=50) %>% - filter(WalkingDistance_SQ > 0 & WalkingDistance_SQ <= 300) - - -summary(database_full$interviewtime) - -database_full <- database_full %>% filter(interviewtime >= 300) # make change time to 10 seconds? - - -database_full <- database_full %>% - filter(!is.na(Treatment_new)) %>% - mutate(Treatment_A = case_when( - Treatment == 1 ~ "Treated", - Treatment == 2 ~ "Vol_Treated", - Treatment == 3 ~ "Not_Treated", - TRUE ~ NA_character_ - )) %>% - mutate(Treatment_B = case_when( - Treatment_new == 1 | Treatment_new == 2 | Treatment_new == 4 | Treatment_new == 5 ~ "Treated", - Treatment_new == 3 | Treatment_new == 6 ~ "Not_Treated" - )) %>% - mutate(Treatment_C = case_when( - Treatment_new == 1 ~ 'Video 1', - Treatment_new == 2 ~ 'No Video 1', - Treatment_new == 3 ~ 'No Info 2', - Treatment_new == 4 ~ 'No Video 2', - Treatment_new == 5 ~ 'Video 2', - Treatment_new == 6 ~ 'No Treatment 3', - TRUE ~ NA_character_ - )) %>% - mutate(Treatment_D = case_when( - Treatment_new == 1 | Treatment_new == 2 ~ 'Treated', - Treatment_new == 3 ~ 'No Info 2', - Treatment_new == 4 | Treatment_new == 5 ~ 'Vol. Treated', - Treatment_new == 6 ~ 'No Treatment 3', - TRUE ~ NA_character_ - )) - -id_list <- unique(database_full$id) - -# Do we sill want to use this? or only database full? -database <- database %>% filter(id %in% id_list) %>% filter(!is.na(Treatment_new)) -# Building NR Index - -for (i in 1:21) { - variable_name <- paste0("Q38S", sprintf("%02d", i), "W3") # Generate variable name - cat("Table for", variable_name, ":\n") - print(table(database_full[[variable_name]])) - cat("\n") - database_full[[variable_name]] <- as.numeric(factor(database_full[[variable_name]], levels = c("A1", "A2", "A3", "A4", "A5"))) - cat("Table for", variable_name, ":\n") - print(table(database_full[[variable_name]])) - cat("\n") -} - -variables_to_reverse <- c("Q38S02W3", "Q38S03W3", "Q38S10W3", "Q38S11W3", "Q38S13W3", "Q38S14W3", "Q38S15W3", "Q38S18W3") -for (variable_name in variables_to_reverse) { - cat("Table for", variable_name, ":\n") - - # Convert the variable to a factor with numerical levels and reverse the scores - database_full[[variable_name]] <- 6 - as.numeric(database_full[[variable_name]]) - - # Print the table - print(table(database_full[[variable_name]])) - cat("\n") -} -q38_variables <- grep("^Q38", names(database_full), value = TRUE) -database_full$Total_NR <- rowSums(database_full[q38_variables]) -hist(database_full$Total_NR) -database_full <- database_full %>% - mutate(Mean_NR=Total_NR/21) -mean_nr<-mean(database_full$Mean_NR, na.rm = TRUE) -sd_nr<-sd(database_full$Mean_NR, na.rm = TRUE) -database_full <- database_full %>% - mutate(Z_Mean_NR=(Mean_NR-mean_nr)/sd_nr) -database$Z_Mean_NR<- database_full$Z_Mean_NR -summary(database$Z_Mean_NR) - -#Self-Reference Index - -for (i in 8:10) { - variable_name <- paste0("TV", sprintf("%02d", i), "W3") # Generate variable name - cat("Table for", variable_name, ":\n") - print(table(database_full[[variable_name]])) - cat("\n") - database_full[[variable_name]] <- as.numeric(factor(database_full[[variable_name]], levels = c("A1", "A2", "A3", "A4", "A5"))) - cat("Table for", variable_name, ":\n") - print(table(database_full[[variable_name]])) - cat("\n") -} - - -database_full$Total_SR <- database_full$TV08W3+database_full$TV09W3+database_full$TV10W3 -hist(database_full$Total_SR) -database_full <- database_full %>% - mutate(Mean_SR=Total_SR/3) -mean_sr<-mean(database_full$Mean_SR, na.rm = TRUE) -sd_sr<-sd(database_full$Mean_SR, na.rm = TRUE) -database_full <- database_full %>% - mutate(Z_Mean_SR=(Mean_SR-mean_sr)/sd_sr) -database$Z_Mean_SR<- database_full$Z_Mean_SR -summary(database$Z_Mean_SR) +# Prepare variables that we want to use + +database_full <- database_full %>% rename(Gender = "Q03W123", Education = "Q06W123", HHSize = "Q41W123", + WorkingTime = "Q44W123", Birthyear = "Q01W123", Rent_net = "Q07W123", + Number_Kids = "Q42W123", Employment_type = "Q43W123", Conseq_UGS = "Q28W3", + Conseq_Money = "Q29W3", UGS_visits = "Q23W123") + + +database_full <- database_full %>% mutate(Gender = dplyr::recode(Gender, "A1" = 1, "A2" = 2, "A3"=3), + Education = dplyr::recode(Education, "A1" = 1, "A2" = 2, "A3"=3, "A4" = 4, "A5" = 5), + Employment_type = dplyr::recode(Employment_type, "A1" = 1, "A2" = 2, "A3"=3, "A4" = 4, + "A5" = 5, "A6" = 6), + Conseq_UGS = dplyr::recode(Conseq_UGS, "A1" = 5, "A2" = 4, "A3"=3, "A4" = 2, "A5" = 1, "A6" = NA_real_), + Conseq_Money = dplyr::recode(Conseq_Money, "A1" = 5, "A2" = 4, "A3"=3, "A4" = 2, "A5" = 1, "A6" = NA_real_), + UGS_visits = dplyr::recode(UGS_visits, "A1" = 1, "A2" = 2, "A3"=3, "A4" = 4, "A5" = 5, "A6" = 6, + "A7" = 7, "A8" = 8, "A9" = 9, "A10"= 10)) + +database_full <- database_full %>% mutate(Gender_female = case_when(Gender == 2 ~1, TRUE~0), + Age = 2023-Birthyear, + Uni_degree = case_when(Education == 5 ~1, TRUE~0), + Kids_Dummy = case_when(Number_Kids > 0 ~ 1, TRUE ~0), + Employment_full = case_when(Employment_type == 1 ~ 1, TRUE~0), + Pensioner = case_when(Employment_type == 6 ~ 1, TRUE~0), + Age_mean = Age - mean(Age), + Income_mean = QFIncome - mean(QFIncome)) + +# Data cleaning + + + +database_full <- database_full %>% filter(Rent_SQ <= 10000 & Rent_SQ >=50) %>% + filter(WalkingDistance_SQ > 0 & WalkingDistance_SQ <= 300) %>% + filter(Gender!=3) + +database <- database %>% filter(Rent_SQ <= 10000 & Rent_SQ >=50) %>% + filter(WalkingDistance_SQ > 0 & WalkingDistance_SQ <= 300) + + +summary(database_full$interviewtime) + +database_full <- database_full %>% filter(interviewtime >= 300) # make change time to 10 seconds? + + +database_full <- database_full %>% + filter(!is.na(Treatment_new)) %>% + mutate(Treatment_A = case_when( + Treatment == 1 ~ "Treated", + Treatment == 2 ~ "Vol_Treated", + Treatment == 3 ~ "Not_Treated", + TRUE ~ NA_character_ + )) %>% + mutate(Treatment_B = case_when( + Treatment_new == 1 | Treatment_new == 2 | Treatment_new == 4 | Treatment_new == 5 ~ "Treated", + Treatment_new == 3 | Treatment_new == 6 ~ "Not_Treated" + )) %>% + mutate(Treatment_C = case_when( + Treatment_new == 1 ~ 'Video 1', + Treatment_new == 2 ~ 'No Video 1', + Treatment_new == 3 ~ 'No Info 2', + Treatment_new == 4 ~ 'No Video 2', + Treatment_new == 5 ~ 'Video 2', + Treatment_new == 6 ~ 'No Treatment 3', + TRUE ~ NA_character_ + )) %>% + mutate(Treatment_D = case_when( + Treatment_new == 1 | Treatment_new == 2 ~ 'Treated', + Treatment_new == 3 ~ 'No Info 2', + Treatment_new == 4 | Treatment_new == 5 ~ 'Vol. Treated', + Treatment_new == 6 ~ 'No Treatment 3', + TRUE ~ NA_character_ + )) + +id_list <- unique(database_full$id) + +# Do we sill want to use this? or only database full? +database <- database %>% filter(id %in% id_list) %>% filter(!is.na(Treatment_new)) +# Building NR Index + +for (i in 1:21) { + variable_name <- paste0("Q38S", sprintf("%02d", i), "W3") # Generate variable name + cat("Table for", variable_name, ":\n") + print(table(database_full[[variable_name]])) + cat("\n") + database_full[[variable_name]] <- as.numeric(factor(database_full[[variable_name]], levels = c("A1", "A2", "A3", "A4", "A5"))) + cat("Table for", variable_name, ":\n") + print(table(database_full[[variable_name]])) + cat("\n") +} + +variables_to_reverse <- c("Q38S02W3", "Q38S03W3", "Q38S10W3", "Q38S11W3", "Q38S13W3", "Q38S14W3", "Q38S15W3", "Q38S18W3") +for (variable_name in variables_to_reverse) { + cat("Table for", variable_name, ":\n") + + # Convert the variable to a factor with numerical levels and reverse the scores + database_full[[variable_name]] <- 6 - as.numeric(database_full[[variable_name]]) + + # Print the table + print(table(database_full[[variable_name]])) + cat("\n") +} +q38_variables <- grep("^Q38", names(database_full), value = TRUE) +database_full$Total_NR <- rowSums(database_full[q38_variables]) +hist(database_full$Total_NR) +database_full <- database_full %>% + mutate(Mean_NR=Total_NR/21) +mean_nr<-mean(database_full$Mean_NR, na.rm = TRUE) +sd_nr<-sd(database_full$Mean_NR, na.rm = TRUE) +database_full <- database_full %>% + mutate(Z_Mean_NR=(Mean_NR-mean_nr)/sd_nr) +database$Z_Mean_NR<- database_full$Z_Mean_NR +summary(database$Z_Mean_NR) + +#Self-Reference Index + +for (i in 8:10) { + variable_name <- paste0("TV", sprintf("%02d", i), "W3") # Generate variable name + cat("Table for", variable_name, ":\n") + print(table(database_full[[variable_name]])) + cat("\n") + database_full[[variable_name]] <- as.numeric(factor(database_full[[variable_name]], levels = c("A1", "A2", "A3", "A4", "A5"))) + cat("Table for", variable_name, ":\n") + print(table(database_full[[variable_name]])) + cat("\n") +} + + +database_full$Total_SR <- database_full$TV08W3+database_full$TV09W3+database_full$TV10W3 +hist(database_full$Total_SR) +database_full <- database_full %>% + mutate(Mean_SR=Total_SR/3) +mean_sr<-mean(database_full$Mean_SR, na.rm = TRUE) +sd_sr<-sd(database_full$Mean_SR, na.rm = TRUE) +database_full <- database_full %>% + mutate(Z_Mean_SR=(Mean_SR-mean_sr)/sd_sr) +database$Z_Mean_SR<- database_full$Z_Mean_SR +summary(database$Z_Mean_SR) diff --git a/Scripts/descriptives.R b/Scripts/descriptives.R new file mode 100644 index 0000000..96497de --- /dev/null +++ b/Scripts/descriptives.R @@ -0,0 +1,439 @@ +# Descriptives + +source("Scripts/ols/ols_time_spent.R") +source("Scripts/ols/ols_quiz.R") +data <- left_join(data, select(quiz_data, id, percentage_correct), by = "id") + +data <- data %>% + mutate( + Control_Not_Pred = ifelse(Matching_Group == "Control Not Pred", 1, 0), + Control_Pred = ifelse(Matching_Group == "Control Pred", 1, 0), + Opt_Not_Pred = ifelse(Matching_Group == "Opt Not Pred", 1, 0), + Opt_Pred = ifelse(Matching_Group == "Opt Pred", 1, 0), + Treat_Not_Pred = ifelse(Matching_Group == "Treat Not Pred", 1, 0), + Treat_Pred = ifelse(Matching_Group == "Treat Pred", 1, 0), + Opt_Treatment = Dummy_Vol_Treated+Dummy_no_info, + Control = ifelse(Treatment_A=="Not_Treated",1,0) + ) + +data$Conseq_score<- data$Conseq_Money+data$Conseq_UGS +table(data$QFIncome) +data <- data %>% + mutate(income = ifelse(QFIncome == 0, NA, QFIncome)) +table(data$Dummy_Vol_Treated) +table(data$Dummy_no_info) +# Compute summary statistics for each group +summary_stats <- data %>% + summarise( + # Number of respondents per group + Full_Sample_n = n(), + Dummy_Treated_n = sum(Dummy_Treated, na.rm = TRUE), + Dummy_Vol_Treated_n = sum(Dummy_Vol_Treated, na.rm = TRUE), + Dummy_no_info_n = sum(Dummy_no_info, na.rm = TRUE), + Control_Not_Pred_n = sum(Control_Not_Pred, na.rm = TRUE), + Control_Pred_n = sum(Control_Pred, na.rm = TRUE), + Opt_Not_Pred_n = sum(Opt_Not_Pred, na.rm = TRUE), + Opt_Pred_n = sum(Opt_Pred, na.rm = TRUE), + Treat_Not_Pred_n = sum(Treat_Not_Pred, na.rm = TRUE), + Treat_Pred_n = sum(Treat_Pred, na.rm = TRUE), + Opt_Treatment_n = sum(Opt_Treatment, na.rm = TRUE), + Control_n = sum(Control, na.rm = TRUE), + + + # Gender (% Female) + Full_Sample_Gender = mean(Gender_female, na.rm = TRUE) * 100, + Dummy_Treated_Gender = mean(Gender_female[Dummy_Treated == 1], na.rm = TRUE) * 100, + Dummy_Vol_Treated_Gender = mean(Gender_female[Dummy_Vol_Treated == 1], na.rm = TRUE) * 100, + Dummy_no_info_Gender = mean(Gender_female[Dummy_no_info == 1], na.rm = TRUE) * 100, + Control_Not_Pred_Gender = mean(Gender_female[Control_Not_Pred == 1], na.rm = TRUE) * 100, + Control_Pred_Gender = mean(Gender_female[Control_Pred == 1], na.rm = TRUE) * 100, + Opt_Not_Pred_Gender = mean(Gender_female[Opt_Not_Pred == 1], na.rm = TRUE) * 100, + Opt_Pred_Gender = mean(Gender_female[Opt_Pred == 1], na.rm = TRUE) * 100, + Treat_Not_Pred_Gender = mean(Gender_female[Treat_Not_Pred == 1], na.rm = TRUE) * 100, + Treat_Pred_Gender = mean(Gender_female[Treat_Pred == 1], na.rm = TRUE) * 100, + Opt_Treatment_Gender = mean(Gender_female[Opt_Treatment == 1], na.rm = TRUE) * 100, + Control_Gender = mean(Gender_female[Control == 1], na.rm = TRUE) * 100, + + + # University Degree (%) + Full_Sample_UniDegree = mean(Uni_degree, na.rm = TRUE) * 100, + Dummy_Treated_UniDegree = mean(Uni_degree[Dummy_Treated == 1], na.rm = TRUE) * 100, + Dummy_Vol_Treated_UniDegree = mean(Uni_degree[Dummy_Vol_Treated == 1], na.rm = TRUE) * 100, + Dummy_no_info_UniDegree = mean(Uni_degree[Dummy_no_info == 1], na.rm = TRUE) * 100, + Control_Not_Pred_UniDegree = mean(Uni_degree[Control_Not_Pred == 1], na.rm = TRUE) * 100, + Control_Pred_UniDegree = mean(Uni_degree[Control_Pred == 1], na.rm = TRUE) * 100, + Opt_Not_Pred_UniDegree = mean(Uni_degree[Opt_Not_Pred == 1], na.rm = TRUE) * 100, + Opt_Pred_UniDegree = mean(Uni_degree[Opt_Pred == 1], na.rm = TRUE) * 100, + Treat_Not_Pred_UniDegree = mean(Uni_degree[Treat_Not_Pred == 1], na.rm = TRUE) * 100, + Treat_Pred_UniDegree = mean(Uni_degree[Treat_Pred == 1], na.rm = TRUE) * 100, + Opt_Treatment_UniDegree = mean(Uni_degree[Opt_Treatment == 1], na.rm = TRUE) * 100, + Control_UniDegree = mean(Uni_degree[Control == 1], na.rm = TRUE) * 100, + + # Income Proportion (% for each category 1,2,3,4) for all groups + Full_Sample_Income_1 = mean(income == 1, na.rm = TRUE) * 100, + Full_Sample_Income_2 = mean(income == 2, na.rm = TRUE) * 100, + Full_Sample_Income_3 = mean(income == 3, na.rm = TRUE) * 100, + Full_Sample_Income_4 = mean(income == 4, na.rm = TRUE) * 100, + + Dummy_Treated_Income_1 = mean(income[Dummy_Treated == 1] == 1, na.rm = TRUE) * 100, + Dummy_Treated_Income_2 = mean(income[Dummy_Treated == 1] == 2, na.rm = TRUE) * 100, + Dummy_Treated_Income_3 = mean(income[Dummy_Treated == 1] == 3, na.rm = TRUE) * 100, + Dummy_Treated_Income_4 = mean(income[Dummy_Treated == 1] == 4, na.rm = TRUE) * 100, + + Dummy_no_info_Income_1 = mean(income[Dummy_no_info == 1] == 1, na.rm = TRUE) * 100, + Dummy_no_info_Income_2 = mean(income[Dummy_no_info == 1] == 2, na.rm = TRUE) * 100, + Dummy_no_info_Income_3 = mean(income[Dummy_no_info == 1] == 3, na.rm = TRUE) * 100, + Dummy_no_info_Income_4 = mean(income[Dummy_no_info == 1] == 4, na.rm = TRUE) * 100, + + Dummy_Vol_Treated_Income_1 = mean(income[Dummy_Vol_Treated == 1] == 1, na.rm = TRUE) * 100, + Dummy_Vol_Treated_Income_2 = mean(income[Dummy_Vol_Treated == 1] == 2, na.rm = TRUE) * 100, + Dummy_Vol_Treated_Income_3 = mean(income[Dummy_Vol_Treated == 1] == 3, na.rm = TRUE) * 100, + Dummy_Vol_Treated_Income_4 = mean(income[Dummy_Vol_Treated == 1] == 4, na.rm = TRUE) * 100, + + Control_Not_Pred_Income_1 = mean(income[Control_Not_Pred == 1] == 1, na.rm = TRUE) * 100, + Control_Not_Pred_Income_2 = mean(income[Control_Not_Pred == 1] == 2, na.rm = TRUE) * 100, + Control_Not_Pred_Income_3 = mean(income[Control_Not_Pred == 1] == 3, na.rm = TRUE) * 100, + Control_Not_Pred_Income_4 = mean(income[Control_Not_Pred == 1] == 4, na.rm = TRUE) * 100, + + Control_Pred_Income_1 = mean(income[Control_Pred == 1] == 1, na.rm = TRUE) * 100, + Control_Pred_Income_2 = mean(income[Control_Pred == 1] == 2, na.rm = TRUE) * 100, + Control_Pred_Income_3 = mean(income[Control_Pred == 1] == 3, na.rm = TRUE) * 100, + Control_Pred_Income_4 = mean(income[Control_Pred == 1] == 4, na.rm = TRUE) * 100, + + Opt_Not_Pred_Income_1 = mean(income[Opt_Not_Pred == 1] == 1, na.rm = TRUE) * 100, + Opt_Not_Pred_Income_2 = mean(income[Opt_Not_Pred == 1] == 2, na.rm = TRUE) * 100, + Opt_Not_Pred_Income_3 = mean(income[Opt_Not_Pred == 1] == 3, na.rm = TRUE) * 100, + Opt_Not_Pred_Income_4 = mean(income[Opt_Not_Pred == 1] == 4, na.rm = TRUE) * 100, + + Opt_Pred_Income_1 = mean(income[Opt_Pred == 1] == 1, na.rm = TRUE) * 100, + Opt_Pred_Income_2 = mean(income[Opt_Pred == 1] == 2, na.rm = TRUE) * 100, + Opt_Pred_Income_3 = mean(income[Opt_Pred == 1] == 3, na.rm = TRUE) * 100, + Opt_Pred_Income_4 = mean(income[Opt_Pred == 1] == 4, na.rm = TRUE) * 100, + + Treat_Not_Pred_Income_1 = mean(income[Treat_Not_Pred == 1] == 1, na.rm = TRUE) * 100, + Treat_Not_Pred_Income_2 = mean(income[Treat_Not_Pred == 1] == 2, na.rm = TRUE) * 100, + Treat_Not_Pred_Income_3 = mean(income[Treat_Not_Pred == 1] == 3, na.rm = TRUE) * 100, + Treat_Not_Pred_Income_4 = mean(income[Treat_Not_Pred == 1] == 4, na.rm = TRUE) * 100, + + Treat_Pred_Income_1 = mean(income[Treat_Pred == 1] == 1, na.rm = TRUE) * 100, + Treat_Pred_Income_2 = mean(income[Treat_Pred == 1] == 2, na.rm = TRUE) * 100, + Treat_Pred_Income_3 = mean(income[Treat_Pred == 1] == 3, na.rm = TRUE) * 100, + Treat_Pred_Income_4 = mean(income[Treat_Pred == 1] == 4, na.rm = TRUE) * 100, + + Opt_Treatment_Income_1 = mean(income[Opt_Treatment == 1] == 1, na.rm = TRUE) * 100, + Opt_Treatment_Income_2 = mean(income[Opt_Treatment == 1] == 2, na.rm = TRUE) * 100, + Opt_Treatment_Income_3 = mean(income[Opt_Treatment == 1] == 3, na.rm = TRUE) * 100, + Opt_Treatment_Income_4 = mean(income[Opt_Treatment == 1] == 4, na.rm = TRUE) * 100, + + Control_Income_1 = mean(income[Control == 1] == 1, na.rm = TRUE) * 100, + Control_Income_2 = mean(income[Control == 1] == 2, na.rm = TRUE) * 100, + Control_Income_3 = mean(income[Control == 1] == 3, na.rm = TRUE) * 100, + Control_Income_4 = mean(income[Control == 1] == 4, na.rm = TRUE) * 100, + + # Continuous Variables (Mean) + + Full_Sample_Age = mean(Age, na.rm = TRUE), + Dummy_Treated_Age = mean(Age[Dummy_Treated == 1], na.rm = TRUE), + Dummy_Vol_Treated_Age = mean(Age[Dummy_Vol_Treated == 1], na.rm = TRUE), + Dummy_no_info_Age = mean(Age[Dummy_no_info == 1], na.rm = TRUE), + Control_Not_Pred_Age = mean(Age[Control_Not_Pred == 1], na.rm = TRUE), + Control_Pred_Age = mean(Age[Control_Pred == 1], na.rm = TRUE), + Opt_Not_Pred_Age = mean(Age[Opt_Not_Pred == 1], na.rm = TRUE), + Opt_Pred_Age = mean(Age[Opt_Pred == 1], na.rm = TRUE), + Treat_Not_Pred_Age = mean(Age[Treat_Not_Pred == 1], na.rm = TRUE), + Treat_Pred_Age = mean(Age[Treat_Pred == 1], na.rm = TRUE), + Opt_Treatment_Age = mean(Age[Opt_Treatment == 1], na.rm = TRUE), + Control_Age = mean(Age[Control == 1], na.rm = TRUE), + + Full_Sample_NR = mean(Z_Mean_NR, na.rm = TRUE), + Dummy_Treated_NR = mean(Z_Mean_NR[Dummy_Treated == 1], na.rm = TRUE), + Dummy_Vol_Treated_NR = mean(Z_Mean_NR[Dummy_Vol_Treated == 1], na.rm = TRUE), + Dummy_no_info_NR = mean(Z_Mean_NR[Dummy_no_info == 1], na.rm = TRUE), + Control_Not_Pred_NR = mean(Z_Mean_NR[Control_Not_Pred == 1], na.rm = TRUE), + Control_Pred_NR = mean(Z_Mean_NR[Control_Pred == 1], na.rm = TRUE), + Opt_Not_Pred_NR = mean(Z_Mean_NR[Opt_Not_Pred == 1], na.rm = TRUE), + Opt_Pred_NR = mean(Z_Mean_NR[Opt_Pred == 1], na.rm = TRUE), + Treat_Not_Pred_NR = mean(Z_Mean_NR[Treat_Not_Pred == 1], na.rm = TRUE), + Treat_Pred_NR = mean(Z_Mean_NR[Treat_Pred == 1], na.rm = TRUE), + Opt_Treatment_NR = mean(Z_Mean_NR[Opt_Treatment == 1], na.rm = TRUE), + Control_NR = mean(Z_Mean_NR[Control == 1], na.rm = TRUE), + + Full_Sample_Conseq = mean(Conseq_score, na.rm = TRUE), + Dummy_Treated_Conseq = mean(Conseq_score[Dummy_Treated == 1], na.rm = TRUE), + Dummy_Vol_Treated_Conseq = mean(Conseq_score[Dummy_Vol_Treated == 1], na.rm = TRUE), + Dummy_no_info_Conseq = mean(Conseq_score[Dummy_no_info == 1], na.rm = TRUE), + Control_Not_Pred_Conseq = mean(Conseq_score[Control_Not_Pred == 1], na.rm = TRUE), + Control_Pred_Conseq = mean(Conseq_score[Control_Pred == 1], na.rm = TRUE), + Opt_Not_Pred_Conseq = mean(Conseq_score[Opt_Not_Pred == 1], na.rm = TRUE), + Opt_Pred_Conseq = mean(Conseq_score[Opt_Pred == 1], na.rm = TRUE), + Treat_Not_Pred_Conseq = mean(Conseq_score[Treat_Not_Pred == 1], na.rm = TRUE), + Treat_Pred_Conseq = mean(Conseq_score[Treat_Pred == 1], na.rm = TRUE), + Opt_Treatment_Conseq = mean(Conseq_score[Opt_Treatment == 1], na.rm = TRUE), + Control_Conseq = mean(Conseq_score[Control == 1], na.rm = TRUE), + + Full_Sample_Correct = mean(percentage_correct, na.rm = TRUE), + Dummy_Treated_Correct = mean(percentage_correct[Dummy_Treated == 1], na.rm = TRUE), + Dummy_Vol_Treated_Correct = mean(percentage_correct[Dummy_Vol_Treated == 1], na.rm = TRUE), + Dummy_no_info_Correct = mean(percentage_correct[Dummy_no_info == 1], na.rm = TRUE), + Control_Not_Pred_Correct = mean(percentage_correct[Control_Not_Pred == 1], na.rm = TRUE), + Control_Pred_Correct = mean(percentage_correct[Control_Pred == 1], na.rm = TRUE), + Opt_Not_Pred_Correct = mean(percentage_correct[Opt_Not_Pred == 1], na.rm = TRUE), + Opt_Pred_Correct = mean(percentage_correct[Opt_Pred == 1], na.rm = TRUE), + Treat_Not_Pred_Correct = mean(percentage_correct[Treat_Not_Pred == 1], na.rm = TRUE), + Treat_Pred_Correct = mean(percentage_correct[Treat_Pred == 1], na.rm = TRUE), + Opt_Treatment_Correct = mean(percentage_correct[Opt_Treatment == 1], na.rm = TRUE), + Control_Correct = mean(percentage_correct[Control == 1], na.rm = TRUE), + + Full_Sample_InterviewTime = mean(interviewtime_net_clean, na.rm = TRUE), + Dummy_Treated_InterviewTime = mean(interviewtime_net_clean[Dummy_Treated == 1], na.rm = TRUE), + Dummy_Vol_Treated_InterviewTime = mean(interviewtime_net_clean[Dummy_Vol_Treated == 1], na.rm = TRUE), + Dummy_no_info_InterviewTime = mean(interviewtime_net_clean[Dummy_no_info == 1], na.rm = TRUE), + Control_Not_Pred_InterviewTime = mean(interviewtime_net_clean[Control_Not_Pred == 1], na.rm = TRUE), + Control_Pred_InterviewTime = mean(interviewtime_net_clean[Control_Pred == 1], na.rm = TRUE), + Opt_Not_Pred_InterviewTime = mean(interviewtime_net_clean[Opt_Not_Pred == 1], na.rm = TRUE), + Opt_Pred_InterviewTime = mean(interviewtime_net_clean[Opt_Pred == 1], na.rm = TRUE), + Treat_Not_Pred_InterviewTime = mean(interviewtime_net_clean[Treat_Not_Pred == 1], na.rm = TRUE), + Treat_Pred_InterviewTime = mean(interviewtime_net_clean[Treat_Pred == 1], na.rm = TRUE), + Opt_Treatment_InterviewTime = mean(interviewtime_net_clean[Opt_Treatment == 1], na.rm = TRUE), + Control_InterviewTime = mean(interviewtime_net_clean[Control == 1], na.rm = TRUE), + + + Full_Sample_CC_time = mean(CC_time_mean_clean, na.rm = TRUE), + Dummy_Treated_CC_time = mean(CC_time_mean_clean[Dummy_Treated == 1], na.rm = TRUE), + Dummy_Vol_Treated_CC_time = mean(CC_time_mean_clean[Dummy_Vol_Treated == 1], na.rm = TRUE), + Dummy_no_info_CC_time = mean(CC_time_mean_clean[Dummy_no_info == 1], na.rm = TRUE), + Control_Not_Pred_CC_time = mean(CC_time_mean_clean[Control_Not_Pred == 1], na.rm = TRUE), + Control_Pred_CC_time = mean(CC_time_mean_clean[Control_Pred == 1], na.rm = TRUE), + Opt_Not_Pred_CC_time = mean(CC_time_mean_clean[Opt_Not_Pred == 1], na.rm = TRUE), + Opt_Pred_CC_time = mean(CC_time_mean_clean[Opt_Pred == 1], na.rm = TRUE), + Treat_Not_Pred_CC_time = mean(CC_time_mean_clean[Treat_Not_Pred == 1], na.rm = TRUE), + Treat_Pred_CC_time = mean(CC_time_mean_clean[Treat_Pred == 1], na.rm = TRUE), + Opt_Treatment_CC_time = mean(CC_time_mean_clean[Opt_Treatment == 1], na.rm = TRUE), + Control_CC_time = mean(CC_time_mean_clean[Control == 1], na.rm = TRUE) + + ) + + +write.csv(summary_stats,"Data/descriptives.csv", row.names = F) +write.table(summary_stats, "Data/descriptives.txt", sep="\t") + + + +# Names of the variables in the first Column: +categories <- c("Gender", "Proportion Female:", "Age", "Net Houshold Income", + "Less than 1500€", "1500€ - 3000€", + "300€- 4000€", "More than 4000€", + "Education", "University degree", "No University degree", + "NR-Index", "Correct quiz question", "Consequentiality score", + "Net interview time", "Mean Choice Card time", "Number of respondents") + + + +# Create the data frame +table_df <- data.frame(Category = categories, + Sample = rep(NA, length(categories)), + Treated = rep(NA, length(categories)), + Opt_Treatment = rep(NA, length(categories)), + Control = rep(NA, length(categories)), + Vol_Treated = rep(NA, length(categories)), + No_Info = rep(NA, length(categories)), + Treated_Predicted = rep(NA, length(categories)), + Treated_Not_Predicted = rep(NA, length(categories)), + Opt_Treatment_Predicted = rep(NA, length(categories)), + Opt_Treatment_Not_Predicted = rep(NA, length(categories)), + Control_Predicted = rep(NA, length(categories)), + Control_Not_Predicted = rep(NA, length(categories))) + + +table_df$Sample[2:2] <- round(summary_stats$Full_Sample_Gender[1:1],2) # Ensure the counts match the categories +table_df$Sample[3:3] <- round(summary_stats$Full_Sample_Age[1:1],2) # Ensure the counts match the categories +table_df$Sample[5:5] <- round(summary_stats$Full_Sample_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Sample[6:6] <- round(summary_stats$Full_Sample_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Sample[7:7] <- round(summary_stats$Full_Sample_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Sample[8:8] <- round(summary_stats$Full_Sample_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Sample[10:10] <- round(summary_stats$Full_Sample_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Sample[11:11] <- round(100-summary_stats$Full_Sample_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Sample[12:12] <- round(summary_stats$Full_Sample_NR[1:1],2) # Ensure the counts match the categories +table_df$Sample[13:13] <- round(summary_stats$Full_Sample_Correct [1:1],2) # Ensure the counts match the categories +table_df$Sample[14:14] <- round(summary_stats$Full_Sample_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Sample[15:15] <- round(summary_stats$Full_Sample_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Sample[16:16] <- round(summary_stats$Full_Sample_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Sample[17:17] <- round(summary_stats$Full_Sample_n[1:1],0) # Ensure the counts match the categories + +table_df$Treated[2:2] <- round(summary_stats$Dummy_Treated_Gender[1:1],2) # Ensure the counts match the categories +table_df$Treated[3:3] <- round(summary_stats$Dummy_Treated_Age[1:1],2) # Ensure the counts match the categories +table_df$Treated[5:5] <- round(summary_stats$Dummy_Treated_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Treated[6:6] <- round(summary_stats$Dummy_Treated_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Treated[7:7] <- round(summary_stats$Dummy_Treated_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Treated[8:8] <- round(summary_stats$Dummy_Treated_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Treated[10:10] <- round(summary_stats$Dummy_Treated_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Treated[11:11] <- round(100-summary_stats$Dummy_Treated_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Treated[12:12] <- round(summary_stats$Dummy_Treated_NR[1:1],2) # Ensure the counts match the categories +table_df$Treated[13:13] <- round(summary_stats$Dummy_Treated_Correct [1:1],2) # Ensure the counts match the categories +table_df$Treated[14:14] <- round(summary_stats$Dummy_Treated_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Treated[15:15] <- round(summary_stats$Dummy_Treated_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Treated[16:16] <- round(summary_stats$Dummy_Treated_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Treated[17:17] <- round(summary_stats$Dummy_Treated_n[1:1],0) # Ensure the counts match the categories + + +table_df$Opt_Treatment[2:2] <- round(summary_stats$Opt_Treatment_Gender[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[3:3] <- round(summary_stats$Opt_Treatment_Age[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[5:5] <- round(summary_stats$Opt_Treatment_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[6:6] <- round(summary_stats$Opt_Treatment_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[7:7] <- round(summary_stats$Opt_Treatment_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[8:8] <- round(summary_stats$Opt_Treatment_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[10:10] <- round(summary_stats$Opt_Treatment_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[11:11] <- round(100-summary_stats$Opt_Treatment_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[12:12] <- round(summary_stats$Opt_Treatment_NR[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[13:13] <- round(summary_stats$Opt_Treatment_Correct [1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[14:14] <- round(summary_stats$Opt_Treatment_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[15:15] <- round(summary_stats$Opt_Treatment_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Opt_Treatment[16:16] <- round(summary_stats$Opt_Treatment_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment[17:17] <- round(summary_stats$Opt_Treatment_n[1:1],0) + + +table_df$Control[2:2] <- round(summary_stats$Control_Gender[1:1],2) # Ensure the counts match the categories +table_df$Control[3:3] <- round(summary_stats$Control_Age[1:1],2) # Ensure the counts match the categories +table_df$Control[5:5] <- round(summary_stats$Control_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Control[6:6] <- round(summary_stats$Control_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Control[7:7] <- round(summary_stats$Control_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Control[8:8] <- round(summary_stats$Control_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Control[10:10] <- round(summary_stats$Control_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Control[11:11] <- round(100-summary_stats$Control_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Control[12:12] <- round(summary_stats$Control_NR[1:1],2) # Ensure the counts match the categories +table_df$Control[13:13] <- round(summary_stats$Control_Correct [1:1],2) # Ensure the counts match the categories +table_df$Control[14:14] <- round(summary_stats$Control_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Control[15:15] <- round(summary_stats$Control_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Control[16:16] <- round(summary_stats$Control_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Control[17:17] <- round(summary_stats$Control_n[1:1],0) # Ensure the counts match the categories +# Ensure the counts match the categories + + +table_df$Vol_Treated[2:2] <- round(summary_stats$Dummy_Vol_Treated_Gender[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[3:3] <- round(summary_stats$Dummy_Vol_Treated_Age[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[5:5] <- round(summary_stats$Dummy_Vol_Treated_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[6:6] <- round(summary_stats$Dummy_Vol_Treated_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[7:7] <- round(summary_stats$Dummy_Vol_Treated_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[8:8] <- round(summary_stats$Dummy_Vol_Treated_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[10:10] <- round(summary_stats$Dummy_Vol_Treated_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[11:11] <- round(100-summary_stats$Dummy_Vol_Treated_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[12:12] <- round(summary_stats$Dummy_Vol_Treated_NR[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[13:13] <- round(summary_stats$Dummy_Vol_Treated_Correct [1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[14:14] <- round(summary_stats$Dummy_Vol_Treated_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[15:15] <- round(summary_stats$Dummy_Vol_Treated_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Vol_Treated[16:16] <- round(summary_stats$Dummy_Vol_Treated_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Vol_Treated[17:17] <- round(summary_stats$Dummy_Vol_Treated_n[1:1],0) + +table_df$No_Info[2:2] <- round(summary_stats$Dummy_no_info_Gender[1:1],2) # Ensure the counts match the categories +table_df$No_Info[3:3] <- round(summary_stats$Dummy_no_info_Age[1:1],2) # Ensure the counts match the categories +table_df$No_Info[5:5] <- round(summary_stats$Dummy_no_info_Income_1[1:1],2) # Ensure the counts match the categories +table_df$No_Info[6:6] <- round(summary_stats$Dummy_no_info_Income_2[1:1],2) # Ensure the counts match the categories +table_df$No_Info[7:7] <- round(summary_stats$Dummy_no_info_Income_3[1:1],2) # Ensure the counts match the categories +table_df$No_Info[8:8] <- round(summary_stats$Dummy_no_info_Income_4[1:1],2) # Ensure the counts match the categories +table_df$No_Info[10:10] <- round(summary_stats$Dummy_no_info_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$No_Info[11:11] <- round(100-summary_stats$Dummy_no_info_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$No_Info[12:12] <- round(summary_stats$Dummy_no_info_NR[1:1],2) # Ensure the counts match the categories +table_df$No_Info[13:13] <- round(summary_stats$Dummy_no_info_Correct [1:1],2) # Ensure the counts match the categories +table_df$No_Info[14:14] <- round(summary_stats$Dummy_no_info_Conseq [1:1],2) # Ensure the counts match the categories +table_df$No_Info[15:15] <- round(summary_stats$Dummy_no_info_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$No_Info[16:16] <- round(summary_stats$Dummy_no_info_CC_time[1:1],2) # Ensure the counts match the categories +table_df$No_Info[17:17] <- round(summary_stats$Dummy_no_info_n[1:1],0) + + +table_df$Treated_Predicted[2:2] <- round(summary_stats$Treat_Pred_Gender[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[3:3] <- round(summary_stats$Treat_Pred_Age[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[5:5] <- round(summary_stats$Treat_Pred_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[6:6] <- round(summary_stats$Treat_Pred_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[7:7] <- round(summary_stats$Treat_Pred_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[8:8] <- round(summary_stats$Treat_Pred_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[10:10] <- round(summary_stats$Treat_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[11:11] <- round(100-summary_stats$Treat_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[12:12] <- round(summary_stats$Treat_Pred_NR[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[13:13] <- round(summary_stats$Treat_Pred_Correct [1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[14:14] <- round(summary_stats$Treat_Pred_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[15:15] <- round(summary_stats$Treat_Pred_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Treated_Predicted[16:16] <- round(summary_stats$Treat_Pred_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Treated_Predicted[17:17] <- round(summary_stats$Treat_Pred_n[1:1],0) + + +table_df$Treated_Not_Predicted[2:2] <- round(summary_stats$Treat_Not_Pred_Gender[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[3:3] <- round(summary_stats$Treat_Not_Pred_Age[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[5:5] <- round(summary_stats$Treat_Not_Pred_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[6:6] <- round(summary_stats$Treat_Not_Pred_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[7:7] <- round(summary_stats$Treat_Not_Pred_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[8:8] <- round(summary_stats$Treat_Not_Pred_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[10:10] <- round(summary_stats$Treat_Not_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[11:11] <- round(100-summary_stats$Treat_Not_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[12:12] <- round(summary_stats$Treat_Not_Pred_NR[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[13:13] <- round(summary_stats$Treat_Not_Pred_Correct [1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[14:14] <- round(summary_stats$Treat_Not_Pred_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[15:15] <- round(summary_stats$Treat_Not_Pred_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[16:16] <- round(summary_stats$Treat_Not_Pred_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Treated_Not_Predicted[17:17] <- round(summary_stats$Treat_Not_Pred_n[1:1],0) + + + +table_df$Opt_Treatment_Predicted[2:2] <- round(summary_stats$Opt_Pred_Gender[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[3:3] <- round(summary_stats$Opt_Pred_Age[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[5:5] <- round(summary_stats$Opt_Pred_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[6:6] <- round(summary_stats$Opt_Pred_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[7:7] <- round(summary_stats$Opt_Pred_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[8:8] <- round(summary_stats$Opt_Pred_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[10:10] <- round(summary_stats$Opt_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[11:11] <- round(100-summary_stats$Opt_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[12:12] <- round(summary_stats$Opt_Pred_NR[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[13:13] <- round(summary_stats$Opt_Pred_Correct [1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[14:14] <- round(summary_stats$Opt_Pred_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[15:15] <- round(summary_stats$Opt_Pred_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[16:16] <- round(summary_stats$Opt_Pred_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Predicted[17:17] <- round(summary_stats$Opt_Pred_n[1:1],0) + +table_df$Opt_Treatment_Not_Predicted[2:2] <- round(summary_stats$Opt_Not_Pred_Gender[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[3:3] <- round(summary_stats$Opt_Not_Pred_Age[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[5:5] <- round(summary_stats$Opt_Not_Pred_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[6:6] <- round(summary_stats$Opt_Not_Pred_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[7:7] <- round(summary_stats$Opt_Not_Pred_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[8:8] <- round(summary_stats$Opt_Not_Pred_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[10:10] <- round(summary_stats$Opt_Not_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[11:11] <- round(100-summary_stats$Opt_Not_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[12:12] <- round(summary_stats$Opt_Not_Pred_NR[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[13:13] <- round(summary_stats$Opt_Not_Pred_Correct [1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[14:14] <- round(summary_stats$Opt_Not_Pred_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[15:15] <- round(summary_stats$Opt_Not_Pred_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[16:16] <- round(summary_stats$Opt_Not_Pred_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Opt_Treatment_Not_Predicted[17:17] <- round(summary_stats$Opt_Not_Pred_n[1:1],0) + + + +table_df$Control_Predicted[2:2] <- round(summary_stats$Control_Pred_Gender[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[3:3] <- round(summary_stats$Control_Pred_Age[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[5:5] <- round(summary_stats$Control_Pred_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[6:6] <- round(summary_stats$Control_Pred_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[7:7] <- round(summary_stats$Control_Pred_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[8:8] <- round(summary_stats$Control_Pred_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[10:10] <- round(summary_stats$Control_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[11:11] <- round(100-summary_stats$Control_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[12:12] <- round(summary_stats$Control_Pred_NR[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[13:13] <- round(summary_stats$Control_Pred_Correct [1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[14:14] <- round(summary_stats$Control_Pred_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[15:15] <- round(summary_stats$Control_Pred_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Control_Predicted[16:16] <- round(summary_stats$Control_Pred_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Control_Predicted[17:17] <- round(summary_stats$Control_Pred_n[1:1],0) + +table_df$Control_Not_Predicted[2:2] <- round(summary_stats$Control_Not_Pred_Gender[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[3:3] <- round(summary_stats$Control_Not_Pred_Age[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[5:5] <- round(summary_stats$Control_Not_Pred_Income_1[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[6:6] <- round(summary_stats$Control_Not_Pred_Income_2[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[7:7] <- round(summary_stats$Control_Not_Pred_Income_3[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[8:8] <- round(summary_stats$Control_Not_Pred_Income_4[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[10:10] <- round(summary_stats$Control_Not_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[11:11] <- round(100-summary_stats$Control_Not_Pred_UniDegree[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[12:12] <- round(summary_stats$Control_Not_Pred_NR[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[13:13] <- round(summary_stats$Control_Not_Pred_Correct [1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[14:14] <- round(summary_stats$Control_Not_Pred_Conseq [1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[15:15] <- round(summary_stats$Control_Not_Pred_InterviewTime[1:1],0) # Ensure the counts match the categories +table_df$Control_Not_Predicted[16:16] <- round(summary_stats$Control_Not_Pred_CC_time[1:1],2) # Ensure the counts match the categories +table_df$Control_Not_Predicted[17:17] <- round(summary_stats$Control_Not_Pred_n[1:1],0) + +write.table(table_df, "Data/descriptives_long.txt", sep="\t") +xtable(table_df, include.rownames = F) diff --git a/Scripts/lapage_results.R b/Scripts/lapage_results.R new file mode 100644 index 0000000..fa2e855 --- /dev/null +++ b/Scripts/lapage_results.R @@ -0,0 +1,210 @@ +#******************************************************************** +### LEPAGE TEST FUNCTIONS (FROM NSM3 PACKAGE) + +lepage.test <- function(x, y = NA, g = NA, method = NA, n.mc = 10000){ + + ##Adapted from kruskal.test()## + if (is.list(x)) { + if (length(x) < 2L) + stop("'x' must be a list with at least 2 elements") + y <- x[[2]] + x <- x[[1]] + } + else { + if(min(is.na(y)) != 0){ + k <- length(unique(g)) + if (length(x) != length(g)) + stop("'x' and 'g' must have the same length") + if (k < 2) + stop("all observations are in the same group") + y <- x[g == 2] + x <- x[g == 1] + } + } + ##################### + + outp <- list() + outp$m <- m <- length(x) + outp$n <- n <- length(y) + outp$n.mc <- n.mc + N <- outp$m + outp$n + outp$ties <- (length(c(x, y)) != length(unique(c(x, y)))) + even <- (outp$m + outp$n + 1)%%2 + outp$stat.name <- "Lepage D" + + + ##When the user doesn't give us any indication of which method to use, try to pick one. + if(is.na(method)){ + if(choose(outp$m + outp$n, outp$n) <= 10000){ + method <- "Exact" + } + if(choose(outp$m + outp$n, outp$n) > 10000){ + method <- "Monte Carlo" + } + } + ##################################################################### + outp$method <- method + + tmp.W <- rank(c(x, y)) + + our.data <- rbind(c(x, y), c(rep(1, length(x)), rep(0, length(y)))) + sorted <- our.data[1, order(our.data[1, ]) ] + x.labels <-our.data[2, order(our.data[1, ]) ] + + med <- ceiling(N / 2) + if(even){no.ties <- c(1:med, med:1)} + if(!even){no.ties <- c(1:med, (med - 1):1)} + + obs.group <- numeric(N) + group.num <- 1 + for(i in 1:N){ + if(obs.group[i] == 0){ + obs.group[i] <- group.num + for(j in i:N){ + if(sorted[i] == sorted[j]){ + obs.group[j] <- obs.group[i] + } + } + group.num <- group.num + 1; + } + } + + group.ranks <- tapply(no.ties, obs.group, mean) + + tied.ranks <- numeric(N) + for(i in 1:group.num){ + tied.ranks[which(obs.group == as.numeric(names(group.ranks)[i]))] <- group.ranks[i] + } + + tmp.C <- c(tied.ranks[x.labels == 1], tied.ranks[x.labels == 0]) + + ##Only needs to depend on y values + D.calc <- function(C.vals, W.vals){ + + if(even){ + exp.C <- n * (N + 2) / 4 + var.C <- m * n * (N + 2) * (N - 2) / (48 * (N - 1)) + } + if(!even){ + exp.C <- n * (N + 1)^2 / (4 * N) + var.C <- m * n * (N + 1) * (3 + N^2) / (48 * N^2) + } + W.obs <- sum(W.vals) + W.star <- (W.obs - n * (N + 1) / 2) / sqrt(m * n * (N + 1) / 12) + C.star <- (sum(C.vals) - exp.C) / sqrt(var.C) + return(W.star^2 + C.star^2) + } + + outp$obs.stat <- D.calc(tmp.C[(m + 1):N], tmp.W[(m + 1):N]) + + if(outp$method == "Exact"){ + possible.orders <- gtools::combinations(outp$m + outp$n, outp$n) + + possible.C <- t(apply(possible.orders, 1, function(x) tmp.C[x])) + possible.W <- t(apply(possible.orders, 1, function(x) tmp.W[x])) + + theor.dist <- numeric(nrow(possible.C)) + for(i in 1:nrow(possible.C)){ + theor.dist[i] <- D.calc(possible.C[i, ], possible.W[i, ]) + } + + outp$p.value <- mean(theor.dist >= outp$obs.stat) + } + + if(outp$method == "Asymptotic"){ + outp$p.value <- (1 - pchisq(outp$obs.stat, 2)) + } + + if(outp$method == "Monte Carlo"){ + outp$p.value <- 0 + for(i in 1:n.mc){ + mc.sample <- sample(1:N, n) + + if(D.calc(tmp.C[mc.sample], tmp.W[mc.sample]) >= outp$obs.stat){ + outp$p.value = outp$p.value + 1 / n.mc + } + } + } + + cat("\nLepage two-sample location-scale test\n") + cat("\nNull hypothesis: The locations and scales of the two population distributions are equal.\n") + cat("Alternative hypothesis: The locations and/or scales of the two population distributions differ.\n") + cat(paste("\nD = ", round(outp$obs.stat, 3), ", p-value = ", round(outp$p.value, 4), "\n\n", sep="")) + + #class(outp)="NSM3Ch5p" + return(outp) +} +# Example usage +set.seed(123) +x <- rnorm(244, mean = 38.63, sd = 26.68) +y <- rnorm(240, mean = 31.87, sd = 17.81) +x <- rnorm(300, mean = 1, sd = 1) +y <- rnorm(300, mean = 0.963, sd = 0.827) +lepage.test(x, y) + + +set.seed(123) # Ensure reproducibility + +n_sim <- 500 +values <- data.frame(p=double(n_sim), D=double(n_sim) ) + +for (i in 1:n_sim) { + # Sample means and standard deviations from their respective distributions + mean_x <- rnorm(1, mean = 38.63, sd = 7.05) + mean_y <- rnorm(1, mean = 31.87, sd = 5.12) + + sd_x <- rnorm(1, mean = 26.68, sd = 5.48) + sd_y <- rnorm(1, mean = 22.38, sd = 3.63) + + # Ensure standard deviations are positive + sd_x <- max(sd_x, 0.01) + sd_y <- max(sd_y, 0.01) + + # Generate data + x <- rnorm(244, mean = mean_x, sd = sd_x) + y <- rnorm(240, mean = mean_y, sd = sd_y) + + # Perform Lepage test + test <- lepage.test(x, y) + values$p[i] <- test$p.value + values$D[i] <- test$obs.stat + +} + +# Calculate the proportion of p-values < 0.05 +prop_reject <- mean(values$p < 0.10) + + +# Output result +print(prop_reject) + +set.seed(123) # Ensure reproducibility + +n_sim <- 500 +values_fixed <- data.frame(p=double(n_sim), D=double(n_sim) ) +var_coef_x=26.68/38.63 +var_coef_y=22.38/31.87 +for (i in 1:n_sim) { + # Sample means and standard deviations from their respective distributions + mean_x <- rnorm(1, mean = 38.63, sd = 7.05) + mean_y <- rnorm(1, mean = 31.87, sd = 5.12) + + sd_x <- abs(mean_x*var_coef_x) + sd_y <- abs(mean_y*var_coef_y) + # Generate data + x <- rnorm(244, mean = mean_x, sd = sd_x) + y <- rnorm(240, mean = mean_y, sd = sd_y) + + # Perform Lepage test + test <- invisible(lepage.test(x, y)) + values_fixed$p[i] <- test$p.value + values_fixed$D[i] <- test$obs.stat + +} + +# Calculate the proportion of p-values < 0.05 +prop_reject <- mean(values_fixed$p < 0.10) + +# Output result +print(prop_reject) +hist(values_fixed$D) diff --git a/Scripts/lapage_results_naturalness.R b/Scripts/lapage_results_naturalness.R new file mode 100644 index 0000000..8cd1475 --- /dev/null +++ b/Scripts/lapage_results_naturalness.R @@ -0,0 +1,372 @@ +set.seed(123) # Ensure reproducibility + +lepage.test <- function(x, y = NA, g = NA, method = NA, n.mc = 10000){ + + ##Adapted from kruskal.test()## + if (is.list(x)) { + if (length(x) < 2L) + stop("'x' must be a list with at least 2 elements") + y <- x[[2]] + x <- x[[1]] + } + else { + if(min(is.na(y)) != 0){ + k <- length(unique(g)) + if (length(x) != length(g)) + stop("'x' and 'g' must have the same length") + if (k < 2) + stop("all observations are in the same group") + y <- x[g == 2] + x <- x[g == 1] + } + } + ##################### + + outp <- list() + outp$m <- m <- length(x) + outp$n <- n <- length(y) + outp$n.mc <- n.mc + N <- outp$m + outp$n + outp$ties <- (length(c(x, y)) != length(unique(c(x, y)))) + even <- (outp$m + outp$n + 1)%%2 + outp$stat.name <- "Lepage D" + + + ##When the user doesn't give us any indication of which method to use, try to pick one. + if(is.na(method)){ + if(choose(outp$m + outp$n, outp$n) <= 10000){ + method <- "Exact" + } + if(choose(outp$m + outp$n, outp$n) > 10000){ + method <- "Monte Carlo" + } + } + ##################################################################### + outp$method <- method + + tmp.W <- rank(c(x, y)) + + our.data <- rbind(c(x, y), c(rep(1, length(x)), rep(0, length(y)))) + sorted <- our.data[1, order(our.data[1, ]) ] + x.labels <-our.data[2, order(our.data[1, ]) ] + + med <- ceiling(N / 2) + if(even){no.ties <- c(1:med, med:1)} + if(!even){no.ties <- c(1:med, (med - 1):1)} + + obs.group <- numeric(N) + group.num <- 1 + for(i in 1:N){ + if(obs.group[i] == 0){ + obs.group[i] <- group.num + for(j in i:N){ + if(sorted[i] == sorted[j]){ + obs.group[j] <- obs.group[i] + } + } + group.num <- group.num + 1; + } + } + + group.ranks <- tapply(no.ties, obs.group, mean) + + tied.ranks <- numeric(N) + for(i in 1:group.num){ + tied.ranks[which(obs.group == as.numeric(names(group.ranks)[i]))] <- group.ranks[i] + } + + tmp.C <- c(tied.ranks[x.labels == 1], tied.ranks[x.labels == 0]) + + ##Only needs to depend on y values + D.calc <- function(C.vals, W.vals){ + + if(even){ + exp.C <- n * (N + 2) / 4 + var.C <- m * n * (N + 2) * (N - 2) / (48 * (N - 1)) + } + if(!even){ + exp.C <- n * (N + 1)^2 / (4 * N) + var.C <- m * n * (N + 1) * (3 + N^2) / (48 * N^2) + } + W.obs <- sum(W.vals) + W.star <- (W.obs - n * (N + 1) / 2) / sqrt(m * n * (N + 1) / 12) + C.star <- (sum(C.vals) - exp.C) / sqrt(var.C) + return(W.star^2 + C.star^2) + } + + outp$obs.stat <- D.calc(tmp.C[(m + 1):N], tmp.W[(m + 1):N]) + + if(outp$method == "Exact"){ + possible.orders <- gtools::combinations(outp$m + outp$n, outp$n) + + possible.C <- t(apply(possible.orders, 1, function(x) tmp.C[x])) + possible.W <- t(apply(possible.orders, 1, function(x) tmp.W[x])) + + theor.dist <- numeric(nrow(possible.C)) + for(i in 1:nrow(possible.C)){ + theor.dist[i] <- D.calc(possible.C[i, ], possible.W[i, ]) + } + + outp$p.value <- mean(theor.dist >= outp$obs.stat) + } + + if(outp$method == "Asymptotic"){ + outp$p.value <- (1 - pchisq(outp$obs.stat, 2)) + } + + if(outp$method == "Monte Carlo"){ + outp$p.value <- 0 + for(i in 1:n.mc){ + mc.sample <- sample(1:N, n) + + if(D.calc(tmp.C[mc.sample], tmp.W[mc.sample]) >= outp$obs.stat){ + outp$p.value = outp$p.value + 1 / n.mc + } + } + } + + cat("\nLepage two-sample location-scale test\n") + cat("\nNull hypothesis: The locations and scales of the two population distributions are equal.\n") + cat("Alternative hypothesis: The locations and/or scales of the two population distributions differ.\n") + cat(paste("\nD = ", round(outp$obs.stat, 3), ", p-value = ", round(outp$p.value, 4), "\n\n", sep="")) + + #class(outp)="NSM3Ch5p" + return(outp) +} +lapage <- function(mu_x, se_mu_x, sigma_x, se_sigma_x, n_x, + mu_y, se_mu_y, sigma_y, se_sigma_y, n_y, + model_x, model_y, n_sim) { + + values <- data.frame(p = double(n_sim), D = double(n_sim)) + + for (i in 1:n_sim) { + # Sample means and standard deviations from their respective distributions + mean_x <- rnorm(1, mean = mu_x, sd = se_mu_x) + mean_y <- rnorm(1, mean = mu_y, sd = se_mu_y) + + sd_x <- rnorm(1, mean = sigma_x, sd = se_sigma_x) + sd_y <- rnorm(1, mean = sigma_y, sd = se_sigma_y) + + # Ensure standard deviations are positive + sd_x <- max(sd_x, 0.01) + sd_y <- max(sd_y, 0.01) + + # Generate data + x <- rnorm(n_x, mean = mean_x, sd = sd_x) + y <- rnorm(n_y, mean = mean_y, sd = sd_y) + + # Perform Lepage test + test <- lepage.test(x, y) + values$p[i] <- test$p.value + values$D[i] <- test$obs.stat + } + + # Calculate the proportion of p-values < 0.05 + prop_reject <- mean(values$p < 0.05) + + # Print the result + cat(paste0(model_x, " vs ", model_y, "\n")) + cat(paste0("Reject in ", prop_reject * 100, "%\n")) + + return(values) +} + + + +# Load models +MXL_wtp_Control_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Not_Pred") +MXL_wtp_Control_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Pred") +MXL_wtp_Opt_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Not_Pred") +MXL_wtp_Opt_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Pred") +MXL_wtp_Treated_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Not_Pred") +MXL_wtp_Treated_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Pred") + +MXL_wtp_opt_treated <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Vol_Treat") +MXL_wtp_opt_not_treated <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Vol_Not_Treat") + +mxl_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Treated A") +mxl_vol_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Vol_Treated A") +mxl_not_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Not_Treated A") + +# List of models +models <- list( + "Treated Not Pred" = MXL_wtp_Treated_Not_Pred_model, + "Opt Not Pred" = MXL_wtp_Opt_Not_Pred_model, + "Control Not Pred" = MXL_wtp_Control_Not_Pred_model, + "Treated Pred" = MXL_wtp_Treated_Pred_model, + "Opt Pred" = MXL_wtp_Opt_Pred_model, + "Control Pred" = MXL_wtp_Control_Pred_model, + "Treated Split Sample" = mxl_tr, + "Opt Split Sample" = mxl_vol_tr, + "Control Split Sample" = mxl_not_tr +) + +# Initialize an empty list to store results +model_data_list <- list() + +# Loop through the models and extract coefficients +for (model_name in names(models)) { + model <- models[[model_name]] + + # Extract mean and standard deviation for 'mu_natural' and 'sig_natural' + coef <- model$estimate + se <- model$se + + temp_data <- data.frame( + model = model_name, + mu = coef["mu_natural"], + sigma = coef["sig_natural"], + se_mu = se["mu_natural"], + se_sigma = se["sig_natural"], + n = model$nIndivs + ) + + # Store in list + model_data_list[[model_name]] <- temp_data +} + +# Combine all data into a single data frame +model_data <- bind_rows(model_data_list) + +# Define the pairs of models for comparison +model_pairs <- list( + c("Treated Split Sample", "Control Split Sample"), + c("Treated Split Sample", "Opt Split Sample"), + c("Opt Split Sample", "Control Split Sample"), + c("Treated Not Pred", "Treated Pred"), + c("Opt Not Pred", "Opt Pred"), + c("Control Not Pred" , "Control Pred" ), + c("Treated Not Pred", "Control Not Pred"), + c("Opt Not Pred", "Control Not Pred"), + c("Treated Not Pred", "Opt Not Pred"), + c("Treated Pred", "Control Pred"), + c("Opt Pred", "Control Pred"), + c("Treated Pred", "Opt Pred") +) + +# Initialize an empty list to store results +results_list <- list() + +# Number of simulations +n_sim <- 1000 # Adjust as needed + +# Loop over model pairs +for (pair in model_pairs) { + model_x <- pair[1] + model_y <- pair[2] + + # Extract relevant data + x_data <- model_data %>% filter(model == model_x) + y_data <- model_data %>% filter(model == model_y) + + # Run the Lapage function + result <- lapage( + x_data$mu, x_data$se_mu, x_data$sigma, x_data$se_sigma, x_data$n, + y_data$mu, y_data$se_mu, y_data$sigma, y_data$se_sigma, y_data$n, + model_x, model_y, n_sim + ) + + # Store results + results_list[[paste(model_x, "vs", model_y)]] <- data.frame( + model_x = model_x, + model_y = model_y, + prop_reject = mean(result$p < 0.05) * 100.00 + ) +} + +# Combine results into a single data frame +final_results <- bind_rows(results_list) + +# Print or save results +print(final_results) + + + + + + + + + + + +# Initialize an empty list to store results +model_data_list <- list() + +# Loop through the models and extract coefficients +for (model_name in names(models)) { + model <- models[[model_name]] + + # Extract mean and standard deviation for 'mu_natural' and 'sig_natural' + coef <- model$estimate + se <- model$se + + temp_data <- data.frame( + model = model_name, + mu = coef["mu_walking"], + sigma = coef["sig_walking"], + se_mu = se["mu_walking"], + se_sigma = se["sig_walking"], + n = model$nIndivs + ) + + # Store in list + model_data_list[[model_name]] <- temp_data +} + +# Combine all data into a single data frame +model_data <- bind_rows(model_data_list) + +# Define the pairs of models for comparison +model_pairs <- list( + c("Treated Split Sample", "Control Split Sample"), + c("Treated Split Sample", "Opt Split Sample"), + c("Opt Split Sample", "Control Split Sample"), + c("Treated Not Pred", "Treated Pred"), + c("Opt Not Pred", "Opt Pred"), + c("Control Not Pred" , "Control Pred" ), + c("Treated Not Pred", "Control Not Pred"), + c("Opt Not Pred", "Control Not Pred"), + c("Treated Not Pred", "Opt Not Pred"), + c("Treated Pred", "Control Pred"), + c("Opt Pred", "Control Pred"), + c("Treated Pred", "Opt Pred") +) + +# Initialize an empty list to store results +results_list <- list() + +# Number of simulations +n_sim <- 1000 # Adjust as needed + +# Loop over model pairs +for (pair in model_pairs) { + model_x <- pair[1] + model_y <- pair[2] + + # Extract relevant data + x_data <- model_data %>% filter(model == model_x) + y_data <- model_data %>% filter(model == model_y) + + # Run the Lapage function + result <- lapage( + x_data$mu, x_data$se_mu, x_data$sigma, x_data$se_sigma, x_data$n, + y_data$mu, y_data$se_mu, y_data$sigma, y_data$se_sigma, y_data$n, + model_x, model_y, n_sim + ) + + # Store results + results_list[[paste(model_x, "vs", model_y)]] <- data.frame( + model_x = model_x, + model_y = model_y, + prop_reject = mean(result$p < 0.05) * 100.00 + ) +} + +# Combine results into a single data frame +final_results_wd <- bind_rows(results_list) + +# Print or save results +print(final_results_wd) +saveRDS(final_results, "Data/Lapage_Nat.RDS") +saveRDS(final_results_wd, "Data/Lapage_Nat.RDS") diff --git a/Scripts/logit/chr_vol_treat.R b/Scripts/logit/chr_vol_treat.R index caff63e..bfe28f7 100644 --- a/Scripts/logit/chr_vol_treat.R +++ b/Scripts/logit/chr_vol_treat.R @@ -1,186 +1,188 @@ -library(dplyr) -library(tidyr) -library(margins) -library(lubridate) -library(caret) -library(randomForest) -library(pROC) - -library(xgboost) -# Test treatment effect - -database_full <- database_full %>% - filter(!is.na(Treatment_new)) %>% - mutate(Dummy_Video_1 = case_when(Treatment_new == 1 ~ 1, TRUE ~ 0), - Dummy_Video_2 = case_when(Treatment_new == 5 ~ 1, TRUE ~ 0), - Dummy_no_info = case_when(Treatment_new == 3 ~ 1, TRUE~0), - Dummy_Info_nv1 = case_when(Treatment_new == 2 ~1, TRUE~0), - Dummy_Info_nv2 = case_when(Treatment_new == 4 ~1 , TRUE~0)) %>% - mutate(Engagement_ugs = groupTime1731 + groupTime1726 + groupTime1769 + groupTime1727 + groupTime1770 + groupTime1771 + groupTime1768 + - groupTime1738) - - - - -data <- database_full %>% - group_by(id) %>% - dplyr::slice(1) %>% - ungroup() -data <- data %>% - mutate(Choice_Treat = ifelse( Dummy_Video_2 == 1 | Dummy_Info_nv2 == 1, 1, - ifelse(Dummy_no_info==1 ,0,NA))) - - - -table(data$Choice_Treat) - - - -logit_choice_treat<-glm(Choice_Treat ~ as.factor(Gender)+Z_Mean_NR+Age_mean + QFIncome + - as.factor(Education), data, family=binomial) -summary(logit_choice_treat) - - -logit_choice_treat_uni<-glm(Choice_Treat ~ as.factor(Gender)+Z_Mean_NR+Age_mean + QFIncome + - Uni_degree + Kids_Dummy + Engagement_ugs + UGS_visits, data, family=binomial) -summary(logit_choice_treat_uni) - - -logit_sig <- glm(Choice_Treat ~ Z_Mean_NR + Age_mean + QFIncome, data, family=binomial) - -# Calculate marginal effects -marginal_effects <- margins(logit_choice_treat_uni) - -# Display the marginal effects -summary(marginal_effects) - - -cut_off <- 0.45 - - -# Make predictions -data$probabilities <- logit_sig %>% predict(data, type = "response") -data$predicted.classes <- ifelse(data$probabilities >= cut_off, 1, 0) -# Model accuracy -mean(data$predicted.classes == data$Choice_Treat, na.rm = T) - - - - - - - -# Assuming your data is in a data frame called 'data' -set.seed(123) # For reproducibility -data<-select(data, Choice_Treat, id, Age, Q02W123, Uni_degree, QFIncome,Z_Mean_NR, City, Screen, - Uni_degree, Q02W123,Q04W123,Q05W123, Q08W123,Q09W123,Q10W123,Q11W3,Q12W123,Q13W23,Q14S01W123,Q14S02W23, - Q14S03W123,Q14S04W123,Q14S05W123,Q14S06W23,Q14S07W3,Q14S08W2,Q14W23, - Q15S01W3,Q15S02W3,Q16W3,Q17W13,Q18W123,Q19W3,C02W23,Q20W23,Q21W23, - Q22S01W123,Q22S02W23,UGS_visits,Q24S01W123, - Q24S02W123,Q24S03W123,Q24S04W23,Q24S05W123,Q25W23,Q26S01W123,Q26S02W123, - Q26S03W23,Q26S04W123,Q26S05W123,Q26S06W123,Q26S07W23,Q26S08W23,Q26S99W23, - Q27W123,Q30W23,Q31S01W23,Q31S02W23,Q31S03W23,Q31S04W23, - Q31S05W23,Q31S06W23,Q31S08W23,Q32S01W23,Q32S02W23,Q32S03W23, - Q33S01W3,Q33S02W3,Q33S03W3,Q33S04W3,Q33S05W3,Q33S06W3,Q33S07W3,Q33S99W3, - Q34W23,Q35W23,Q36W23,Q37W23,Q38S01W3,Q38S02W3,Q38S03W3,Q38S04W3,Q38S05W3, - Q38S06W3,Q38S07W3,Q38S08W3,Q38S09W3,Q38S10W3,Q38S11W3,Q38S12W3,Q38S13W3, - Q38S14W3,Q38S15W3,Q38S16W3,Q38S17W3,Q38S18W3,Q38S19W3,Q38S20W3,Q38S21W3, - V01W3,Q39S01W3,Q39S02W3,Q39S03W3,Q39S04W3,Q39S05W3,Q40W3 - ,Q45W123,Q46S01W3,Q46S02W3,Q46S03W3,Q46S04W3, - Q47S01W23,Q47S02W3,Q47S03W3,Q47S04W3,Q47S05W3,Q47S06W3,Q47S07W3,Q47S08W3, - Q47S09W3,Q48S01W3,Q48S02W3,Q48S03W3,Q48S04W3,Q48S05W3,Q49W23,Q50W123) -data <- data %>% - select(where(~ all(!is.na(.)) & all(. != "")) | any_of("Choice_Treat")) -# Split the data into labeled and unlabeled sets -labeled_data <- filter(data, Choice_Treat==1| Choice_Treat==0) -unlabeled_data <- filter(data, is.na(Choice_Treat)) -labeled_data_id<-labeled_data -labeled_data<-select(labeled_data,-id) -# Assuming the group information is in the column called 'Group' -labeled_data$Choice_Treat<- as.factor(labeled_data$Choice_Treat) -unlabeled_data$Choice_Treat<- as.factor(unlabeled_data$Choice_Treat) - -trainIndex <- createDataPartition(labeled_data$Choice_Treat, p = 0.8, list = FALSE) -trainData <- labeled_data[trainIndex, ] -testData <- labeled_data[-trainIndex, ] - - - - - - -# tuneGrid <- expand.grid( -# nrounds = c(300), -# max_depth = c(15), -# eta = c(0.05), -# gamma = c(0, 0.1), -# colsample_bytree = c(0.6, 0.8), # Fraction of features to be used for each tree -# min_child_weight = c(1, 3, 5), # Minimum sum of instance weight (hessian) needed in a child -# subsample = c(0.7, 0.8) # Fraction of samples to be used for each tree -# ) -# -# -# model3 <- train(Choice_Treat ~ ., -# data = trainData, -# method = "xgbTree", -# tuneGrid = tuneGrid, -# trControl = trainControl(method = "cv", number = 5)) -# -# -# # Get variable importance -# varImp(model3) -# -# -# -# # Evaluate the model -# predictions <- predict(model3, newdata = testData) -# confusionMatrix(predictions, testData$Choice_Treat) -# labeled_predictions <- predict(model3, newdata = labeled_data) -# labeled_data$PredictedGroup <- labeled_predictions -# -# table(labeled_data$Choice_Treat, labeled_data$PredictedGroup) -# -# unlabeled_predictions <- predict(model3, newdata = unlabeled_data) -# labeled_data_id$PredictedGroup <- labeled_predictions -# data_prediction_labeled<-select(labeled_data_id, c("id", "PredictedGroup")) -# saveRDS(data_prediction_labeled, "Data/predictions_labeled.RDS") -# -# unlabeled_data$PredictedGroup <- unlabeled_predictions -# data_prediction<-select(unlabeled_data, c("id", "PredictedGroup")) -# saveRDS(data_prediction, "Data/predictions.RDS") -# -# print(model3$bestTune) -# -# -# table(data$predicted.classes ,data$Choice_Treat) -# -# calculate_metrics <- function(confusion_matrix) { -# # Extract values from the confusion matrix -# TN <- confusion_matrix[1, 1] -# FP <- confusion_matrix[2, 1] -# FN <- confusion_matrix[1, 2] -# TP <- confusion_matrix[2, 2] -# -# # Calculate sensitivity -# sensitivity <- round(TP / (TP + FN), 2) -# -# # Calculate specificity -# specificity <- round(TN / (TN + FP), 2) -# -# # Return the results as a list -# return(list(sensitivity = sensitivity, specificity = specificity)) -# } -# -# # Calculate metrics -# metrics <- calculate_metrics(table(data$predicted.classes ,data$Choice_Treat)) -# -# # Print results -# print(paste("Sensitivity:", metrics$sensitivity)) -# print(paste("Specificity:", metrics$specificity)) -# roc_obj <- roc(data$Choice_Treat, data$probabilities) -# plot(roc_obj, main = "ROC Curve", col = "blue", lwd = 2) -# auc_value <- auc(roc_obj) -# best_coords <- coords(roc_obj, "best", best.method="youden") -# -# +library(dplyr) +library(tidyr) +library(margins) +library(lubridate) +library(caret) +library(randomForest) +library(pROC) + +library(xgboost) +# Test treatment effect + +database_full <- database_full %>% + filter(!is.na(Treatment_new)) %>% + mutate(Dummy_Video_1 = case_when(Treatment_new == 1 ~ 1, TRUE ~ 0), + Dummy_Video_2 = case_when(Treatment_new == 5 ~ 1, TRUE ~ 0), + Dummy_no_info = case_when(Treatment_new == 3 ~ 1, TRUE~0), + Dummy_Info_nv1 = case_when(Treatment_new == 2 ~1, TRUE~0), + Dummy_Info_nv2 = case_when(Treatment_new == 4 ~1 , TRUE~0)) %>% + mutate(Engagement_ugs = groupTime1731 + groupTime1726 + groupTime1769 + groupTime1727 + groupTime1770 + groupTime1771 + groupTime1768 + + groupTime1738) + + + + +data <- database_full %>% + group_by(id) %>% + dplyr::slice(1) %>% + ungroup() +data <- data %>% + mutate(Choice_Treat = ifelse( Dummy_Video_2 == 1 | Dummy_Info_nv2 == 1, 1, + ifelse(Dummy_no_info==1 ,0,NA))) + + + +table(data$Choice_Treat) + + + +logit_choice_treat<-glm(Choice_Treat ~ as.factor(Gender)+Z_Mean_NR+Age_mean + QFIncome + + as.factor(Education), data, family=binomial) +summary(logit_choice_treat) + + +logit_choice_treat_uni<-glm(Choice_Treat ~ as.factor(Gender)+Z_Mean_NR+Age_mean + QFIncome + + Uni_degree + Kids_Dummy + Engagement_ugs + UGS_visits, data, family=binomial) +summary(logit_choice_treat_uni) + + +logit_sig <- glm(Choice_Treat ~ Z_Mean_NR + Uni_degree+as.factor(Gender) + Age_mean + QFIncome, data, family=binomial) +summary(logit_sig) +1 - (logLik(logit_sig) / logLik(update(logit_sig, . ~ 1))) + +# Calculate marginal effects +marginal_effects <- margins(logit_choice_treat_uni) + +# Display the marginal effects +summary(marginal_effects) + + +cut_off <- 0.45 + + +# Make predictions +data$probabilities <- logit_sig %>% predict(data, type = "response") +data$predicted.classes <- ifelse(data$probabilities >= cut_off, 1, 0) +# Model accuracy +mean(data$predicted.classes == data$Choice_Treat, na.rm = T) + + + + + + + +# Assuming your data is in a data frame called 'data' +set.seed(123) # For reproducibility +data<-select(data, Choice_Treat, id, Age, Q02W123, Uni_degree, QFIncome,Z_Mean_NR, City, Screen, + Uni_degree, Q02W123,Q04W123,Q05W123, Q08W123,Q09W123,Q10W123,Q11W3,Q12W123,Q13W23,Q14S01W123,Q14S02W23, + Q14S03W123,Q14S04W123,Q14S05W123,Q14S06W23,Q14S07W3,Q14S08W2,Q14W23, + Q15S01W3,Q15S02W3,Q16W3,Q17W13,Q18W123,Q19W3,C02W23,Q20W23,Q21W23, + Q22S01W123,Q22S02W23,UGS_visits,Q24S01W123, + Q24S02W123,Q24S03W123,Q24S04W23,Q24S05W123,Q25W23,Q26S01W123,Q26S02W123, + Q26S03W23,Q26S04W123,Q26S05W123,Q26S06W123,Q26S07W23,Q26S08W23,Q26S99W23, + Q27W123,Q30W23,Q31S01W23,Q31S02W23,Q31S03W23,Q31S04W23, + Q31S05W23,Q31S06W23,Q31S08W23,Q32S01W23,Q32S02W23,Q32S03W23, + Q33S01W3,Q33S02W3,Q33S03W3,Q33S04W3,Q33S05W3,Q33S06W3,Q33S07W3,Q33S99W3, + Q34W23,Q35W23,Q36W23,Q37W23,Q38S01W3,Q38S02W3,Q38S03W3,Q38S04W3,Q38S05W3, + Q38S06W3,Q38S07W3,Q38S08W3,Q38S09W3,Q38S10W3,Q38S11W3,Q38S12W3,Q38S13W3, + Q38S14W3,Q38S15W3,Q38S16W3,Q38S17W3,Q38S18W3,Q38S19W3,Q38S20W3,Q38S21W3, + V01W3,Q39S01W3,Q39S02W3,Q39S03W3,Q39S04W3,Q39S05W3,Q40W3 + ,Q45W123,Q46S01W3,Q46S02W3,Q46S03W3,Q46S04W3, + Q47S01W23,Q47S02W3,Q47S03W3,Q47S04W3,Q47S05W3,Q47S06W3,Q47S07W3,Q47S08W3, + Q47S09W3,Q48S01W3,Q48S02W3,Q48S03W3,Q48S04W3,Q48S05W3,Q49W23,Q50W123) +data <- data %>% + select(where(~ all(!is.na(.)) & all(. != "")) | any_of("Choice_Treat")) +# Split the data into labeled and unlabeled sets +labeled_data <- filter(data, Choice_Treat==1| Choice_Treat==0) +unlabeled_data <- filter(data, is.na(Choice_Treat)) +labeled_data_id<-labeled_data +labeled_data<-select(labeled_data,-id) +# Assuming the group information is in the column called 'Group' +labeled_data$Choice_Treat<- as.factor(labeled_data$Choice_Treat) +unlabeled_data$Choice_Treat<- as.factor(unlabeled_data$Choice_Treat) + +trainIndex <- createDataPartition(labeled_data$Choice_Treat, p = 0.8, list = FALSE) +trainData <- labeled_data[trainIndex, ] +testData <- labeled_data[-trainIndex, ] + + + + + + + tuneGrid <- expand.grid( + nrounds = c(300), + max_depth = c(15), + eta = c(0.05), + gamma = c(0, 0.1), + colsample_bytree = c(0.6, 0.8), # Fraction of features to be used for each tree + min_child_weight = c(1, 3, 5), # Minimum sum of instance weight (hessian) needed in a child + subsample = c(0.7, 0.8) # Fraction of samples to be used for each tree + ) + + + model3 <- train(Choice_Treat ~ ., + data = trainData, + method = "xgbTree", + tuneGrid = tuneGrid, + trControl = trainControl(method = "cv", number = 5)) + + + # Get variable importance + varImp(model3) + + + + # Evaluate the model + predictions <- predict(model3, newdata = testData) + confusionMatrix(predictions, testData$Choice_Treat) + labeled_predictions <- predict(model3, newdata = labeled_data) + labeled_data$PredictedGroup <- labeled_predictions + + table(labeled_data$Choice_Treat, labeled_data$PredictedGroup) + + unlabeled_predictions <- predict(model3, newdata = unlabeled_data) + labeled_data_id$PredictedGroup <- labeled_predictions + data_prediction_labeled<-select(labeled_data_id, c("id", "PredictedGroup")) + saveRDS(data_prediction_labeled, "Data/predictions_labeled.RDS") + + unlabeled_data$PredictedGroup <- unlabeled_predictions + data_prediction<-select(unlabeled_data, c("id", "PredictedGroup")) + saveRDS(data_prediction, "Data/predictions.RDS") + + print(model3$bestTune) + + +# table(data$predicted.classes ,data$Choice_Treat) +# +# calculate_metrics <- function(confusion_matrix) { +# # Extract values from the confusion matrix +# TN <- confusion_matrix[1, 1] +# FP <- confusion_matrix[2, 1] +# FN <- confusion_matrix[1, 2] +# TP <- confusion_matrix[2, 2] +# +# # Calculate sensitivity +# sensitivity <- round(TP / (TP + FN), 2) +# +# # Calculate specificity +# specificity <- round(TN / (TN + FP), 2) +# +# # Return the results as a list +# return(list(sensitivity = sensitivity, specificity = specificity)) +# } +# +# # Calculate metrics +# metrics <- calculate_metrics(table(data$predicted.classes ,data$Choice_Treat)) +# +# # Print results +# print(paste("Sensitivity:", metrics$sensitivity)) +# print(paste("Specificity:", metrics$specificity)) +# roc_obj <- roc(data$Choice_Treat, data$probabilities) +# plot(roc_obj, main = "ROC Curve", col = "blue", lwd = 2) +# auc_value <- auc(roc_obj) +# best_coords <- coords(roc_obj, "best", best.method="youden") +# +# # cut_off <- best_coords$threshold \ No newline at end of file diff --git a/Scripts/ols/ols_consequentiality.R b/Scripts/ols/ols_consequentiality.R index 981a901..d7ab761 100644 --- a/Scripts/ols/ols_consequentiality.R +++ b/Scripts/ols/ols_consequentiality.R @@ -34,6 +34,11 @@ summary(conseq_model_D) conseq_model_control_D <- lm(Conseq_score ~ as.factor(Treatment_D) + Z_Mean_NR + as.factor(Gender)+Age_mean+ QFIncome + Uni_degree,data) summary(conseq_model_control_D) +cons_data <- left_join(data, matching_group, by="id") + +conseq_model_control_G <- lm(Conseq_score ~ as.factor(Matching_Group) + Z_Mean_NR + as.factor(Gender)+Age_mean+ QFIncome + Uni_degree,cons_data) +summary(conseq_model_control_G) + # conseq_model_control_G <- lm(Conseq_score ~ as.factor(Matching_Group) + Z_Mean_NR + as.factor(Gender)+Age_mean+ QFIncome + Uni_degree,data) # summary(conseq_model_control_G) diff --git a/Scripts/ols/ols_time_spent.R b/Scripts/ols/ols_time_spent.R index fe6de07..bb00c30 100644 --- a/Scripts/ols/ols_time_spent.R +++ b/Scripts/ols/ols_time_spent.R @@ -19,8 +19,8 @@ database <- database %>% Dummy_Treated == 1 & PredictedGroup == 0 ~"Treat Not Pred", Treatment_A == "Vol_Treated" & PredictedGroup == 1 ~ "Opt Pred", Treatment_A == "Vol_Treated" & PredictedGroup == 0 ~ "Opt Not Pred", - Treatment_new == 6 & PredictedGroup == 0 ~ "Control Pred", - TRUE ~ "Control Not Pred")) + Treatment_new == 6 & PredictedGroup == 0 ~ "Control Not Pred", + TRUE ~ "Control Pred")) data <- database %>% group_by(id) %>% diff --git a/Scripts/visualize_distr_spli.R b/Scripts/visualize_distr_spli.R index 9fc1bdf..0c152b8 100644 --- a/Scripts/visualize_distr_spli.R +++ b/Scripts/visualize_distr_spli.R @@ -16,16 +16,16 @@ mxl_not_tr <- apollo_loadModel("Estimation_results/mxl/Split_samples/MXL_wtp Not # List of models models <- list( "Treated Not Pred" = MXL_wtp_Treated_Not_Pred_model, - "Control Not Pred" = MXL_wtp_Control_Not_Pred_model, - "Control Pred" = MXL_wtp_Control_Pred_model, "Opt Not Pred" = MXL_wtp_Opt_Not_Pred_model, + "Control Not Pred" = MXL_wtp_Control_Not_Pred_model, + "Treated Pred" = MXL_wtp_Treated_Pred_model, "Opt Pred" = MXL_wtp_Opt_Pred_model, + "Control Pred" = MXL_wtp_Control_Pred_model, "Opt No Info" = MXL_wtp_opt_not_treated, "Opt Info" = MXL_wtp_opt_treated, - "Treated Pred" = MXL_wtp_Treated_Pred_model, - "Control Split Sample" = mxl_not_tr, + "Treated Split Sample" = mxl_tr, "Opt Split Sample" = mxl_vol_tr, - "Treated Split Sample" = mxl_tr + "Control Split Sample" = mxl_not_tr ) group_color_mapping <- c( @@ -118,27 +118,6 @@ ggplot(plot_data, aes(x = x, y = density, color = Group, linetype = Prediction)) theme(legend.position = "right") -#### Z-test if distributions are different - -z_value <- (MXL_wtp_Treated_Pred_model$estimate["mu_natural"] - MXL_wtp_Control_Pred_model$estimate["mu_natural"])/ - (sqrt((MXL_wtp_Treated_Pred_model$estimate["sig_natural"]/sqrt(MXL_wtp_Treated_Pred_model$nObs)) + - (MXL_wtp_Control_Pred_model$estimate["sig_natural"]/sqrt(MXL_wtp_Control_Pred_model$nObs)))) - - -# Function to calculate z-value using your exact formula -calculate_z_value <- function(model_1, model_2) { - # Calculate the z-value using your specific formula - z_value <- (model_1$estimate["mu_natural"] - model_2$estimate["mu_natural"]) / - (sqrt((model_1$robse["mu_natural"]^2 + model_2$robse["mu_natural"]^2))) - - return(z_value) -} - -# Example of using the function -z_value <- calculate_z_value(MXL_wtp_opt_treated, MXL_wtp_opt_not_treated) - -# Print the z-value -z_value #### Plot underneath @@ -267,12 +246,12 @@ split_sample_models <- models[names(models) %in% c("Treated Split Sample", "Opt # Function to perform Z-tests within a group while ensuring the correct order - perform_z_tests_within_group <- function(model_group, group_name) { + perform_z_tests_within_group <- function(model_group, group_name, coef="mu_natural") { model_names <- names(model_group) z_test_results <- data.frame() # Define the correct comparison order - comparison_order <- c("Treated", "Optional", "Control") # Order: Treated vs Control, Treated vs Optional, Optional vs Control + comparison_order <- c("Treated", "Opt", "Control") # Order: Treated vs Control, Treated vs Optional, Optional vs Control # Reorder the models according to the desired comparison order ordered_models <- model_group[order(match(sub(" .*", "", model_names), comparison_order))] @@ -281,7 +260,7 @@ split_sample_models <- models[names(models) %in% c("Treated Split Sample", "Opt for (i in seq_along(ordered_models)) { for (j in seq_along(ordered_models)) { if (i < j) { # Avoid duplicate comparisons - test_result <- calculate_z_test(ordered_models[[i]], ordered_models[[j]], coef="mu_natural") + test_result <- calculate_z_test(ordered_models[[i]], ordered_models[[j]], coef=coef) test_result$Comparison <- paste(names(ordered_models)[i], "vs", names(ordered_models)[j]) # Add model pair label test_result$Group <- group_name # Label the group z_test_results <- bind_rows(z_test_results, test_result) @@ -292,9 +271,9 @@ split_sample_models <- models[names(models) %in% c("Treated Split Sample", "Opt } # Function to perform cross-group tests while ensuring "Predicted vs Not Predicted" - perform_cross_group_tests <- function(not_predicted, predicted) { + perform_cross_group_tests <- function(not_predicted, predicted, coef="mu_natural") { z_test_results <- data.frame() - comparison_order <- c("Treated", "Optional", "Control") # Ensure correct order + comparison_order <- c("Treated", "Optional", "Control") # Ensure correct order category_map <- list( "Treated Pred" = "Treated Not Pred", @@ -310,7 +289,7 @@ split_sample_models <- models[names(models) %in% c("Treated Split Sample", "Opt cat_not_pred <- category_map[[cat_pred]] if (cat_pred %in% names(ordered_predicted) & cat_not_pred %in% names(ordered_not_predicted)) { - test_result <- calculate_z_test(ordered_predicted[[cat_pred]], ordered_not_predicted[[cat_not_pred]], coef="mu_natural") + test_result <- calculate_z_test(ordered_predicted[[cat_pred]], ordered_not_predicted[[cat_not_pred]], coef=coef) test_result$Comparison <- paste(cat_pred, "vs", cat_not_pred) # Now "Predicted vs Not Predicted" test_result$Group <- "Predicted vs Not Predicted" z_test_results <- bind_rows(z_test_results, test_result) @@ -321,13 +300,20 @@ split_sample_models <- models[names(models) %in% c("Treated Split Sample", "Opt } # Perform Z-tests within each group while enforcing the correct order - z_tests_not_predicted <- perform_z_tests_within_group(not_predicted_models, "Not Predicted") - z_tests_predicted <- perform_z_tests_within_group(predicted_models, "Predicted") - z_tests_split_sample <- perform_z_tests_within_group(split_sample_models, "Split Sample") + coefficients_to_test <- c("mu_natural", "mu_walking", "mu_rent", "ASC_1", "ASC_2") + #coefficients_to_test <- c("sig_natural", "sig_walking", "sig_rent", "sig_ASC_1", "sig_ASC_2") # test sds + - # Perform cross-group tests - z_tests_cross_group <- perform_cross_group_tests(not_predicted_models, predicted_models) + ##### For all + # Within groups + z_tests_not_predicted <- perform_z_tests_within_group(not_predicted_models, "Not Predicted", coefficients_to_test) + z_tests_predicted <- perform_z_tests_within_group(predicted_models, "Predicted", coefficients_to_test) + z_tests_split_sample <- perform_z_tests_within_group(split_sample_models, "Split Sample", coefficients_to_test) + # Cross-group + z_tests_cross_group <- perform_cross_group_tests(not_predicted_models, predicted_models, coefficients_to_test) + +################ Do Plots ######### # Combine all Z-test results z_test_results_all <- bind_rows( z_tests_not_predicted, @@ -347,69 +333,170 @@ split_sample_models <- models[names(models) %in% c("Treated Split Sample", "Opt z_test_results_splits <- z_test_results_all %>% filter(Group %in% c("Split Sample")) z_test_results_cross <- z_test_results_all %>% filter(Group == "Predicted vs Not Predicted") - ### **Visualization 1: Within-Group Comparisons (Ordered by Group)** ### - ggplot(z_test_results_within, aes(x = reorder(Comparison, Group), y = Mean_Difference, color = Significant)) + + + + + + + z_test_results_all <- z_test_results_all %>% + mutate( + Coefficient = sub("\\.+\\d+", "", rownames(z_test_results_all)), # Clean coefficient names + Significant = ifelse(P_Value < 0.05, "Statistically significant difference", "No statistically significant difference") + ) + + + ggplot(z_test_results_all, aes(x = reorder(Comparison, Mean_Difference), y = Mean_Difference, color = Significant)) + geom_point(size = 3) + geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.2) + - scale_color_manual(values = c("Significant" = "red", "Not Significant" = "blue")) + - geom_hline(yintercept = 0, linetype = "dashed", color = "black") + # Line at zero - coord_flip() + # Flip for readability + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + scale_color_manual(values = c("Statistically significant difference" = "red", "No statistically significant difference" = "grey")) + + coord_flip() + + facet_grid(Group ~ Coefficient, scales = "free", space = "free_y") + labs( - #title = "Within-Group Z-Test Results for Mean Differences", x = "Model Comparisons", - y = "Mean Difference (Naturalness)", + y = "Mean Difference", color = "Significance" ) + - facet_wrap(~ Group, scales = "free_y", ncol = 1) + # Separate groups visually - theme_minimal() + - theme(legend.position = "right") + theme_bw() + + theme( + strip.text = element_text(face = "bold"), + legend.position = "bottom", + panel.spacing = unit(0.8, "lines") + ) - ggsave("Figures/z_test_predictions_between.png", width=8, height=5, dpi="print") +#### Only ASC and rent for appendix + z_test_results_appendix <- z_test_results_all %>% + filter(!Coefficient %in% c("mu_natural", "mu_walking")) - #### Only split samples + ggplot(z_test_results_appendix, aes(x = factor(Comparison, level=c("Treated Not Pred vs Opt Not Pred", "Opt Not Pred vs Control Not Pred", + "Treated Not Pred vs Control Not Pred", "Treated Pred vs Opt Pred", + "Opt Pred vs Control Pred", "Treated Pred vs Control Pred", + "Treated Split Sample vs Opt Split Sample", "Opt Split Sample vs Control Split Sample", + "Treated Split Sample vs Control Split Sample", "Control Pred vs Control Not Pred", + "Opt Pred vs Opt Not Pred","Treated Pred vs Treated Not Pred" + )), y = Mean_Difference, color = Significant)) + + geom_point(size = 3) + + geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.2) + + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + scale_color_manual(values = c("Statistically significant difference" = "red", "No statistically significant difference" = "grey")) + + coord_flip() + + facet_grid(Group ~ Coefficient, scales = "free", space = "free_y") + + labs( + x = "Model Comparisons", + y = "Mean Difference", + color = "Significance" + ) + + theme_bw() + + theme( + strip.text = element_text(face = "bold"), + legend.position = "bottom", + panel.spacing = unit(0.8, "lines") + ) + + + ggsave("Figures/z_test_appendix.png", width=12, height=10, dpi="print") + + z_test_results_split_nat_wd <- z_test_results_all %>% + filter(Coefficient %in% c("mu_natural", "mu_walking")) %>% filter(Group=="Split Sample") %>% mutate(Comparison = case_when( + grepl("Treated Split Sample vs Control Split Sample", Comparison) ~ "Treated vs Control", + grepl("Opt Split Sample vs Control Split Sample", Comparison) ~ "Optional vs Control", + grepl("Treated Split Sample vs Opt Split Sample", Comparison) ~ "Treated vs Optional", + TRUE ~ Comparison # Keep original if not matched + )) + + + ggplot(z_test_results_split_nat_wd, aes(x = factor(Comparison, level=c("Treated vs Optional", "Optional vs Control","Treated vs Control")), y = Mean_Difference, color = Significant)) + + geom_point(size = 3) + + geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.2) + + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + scale_color_manual(values = c("Statistically significant difference" = "red", "No statistically significant difference" = "grey")) + + coord_flip() + + facet_grid(Group ~ Coefficient, scales = "free", space = "free_y", + labeller = labeller( + Coefficient = c( + "mu_natural" = "Naturalness", + "mu_walking" = "Walking Distance"))) + + labs( + x = "", + y = "Mean Difference", + color = "Significance" + ) + + theme_bw() + + theme( + strip.text = element_text(face = "bold"), + legend.position = "bottom", + panel.spacing = unit(0.8, "lines") + ) + + ggsave("Figures/z_test_split_samples.png", width=8, height=5, dpi="print") - z_test_results_splits <- z_test_results_splits %>% - mutate(Comparison = case_when( + + z_test_results_pred_nat_wd <- z_test_results_all %>% + filter(Coefficient %in% c("mu_natural", "mu_walking")) %>% filter(Group %in% c("Predicted", "Not Predicted")) %>% mutate(Comparison = case_when( grepl("Treated Split Sample vs Control Split Sample", Comparison) ~ "Treated vs Control", grepl("Treated Split Sample vs Opt Split Sample", Comparison) ~ "Treated vs Optional", grepl("Control Split Sample vs Opt Split Sample", Comparison) ~ "Control vs Optional", TRUE ~ Comparison # Keep original if not matched )) - ggplot(z_test_results_splits, aes(x = reorder(Comparison, Group), y = Mean_Difference, color = Significant)) + + ggplot(z_test_results_pred_nat_wd, aes(x = factor(Comparison, level=c("Treated Not Pred vs Opt Not Pred", "Opt Not Pred vs Control Not Pred", + "Treated Not Pred vs Control Not Pred", "Treated Pred vs Opt Pred", + "Opt Pred vs Control Pred", "Treated Pred vs Control Pred")), y = Mean_Difference, color = Significant)) + geom_point(size = 3) + geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.2) + - scale_color_manual(values = c("Significant" = "red", "Not Significant" = "blue")) + - geom_hline(yintercept = 0, linetype = "dashed", color = "black") + # Line at zero - coord_flip() + # Flip for readability + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + scale_color_manual(values = c("Statistically significant difference" = "red", "No statistically significant difference" = "grey")) + + coord_flip() + + facet_grid(Group ~ Coefficient, scales = "free", space = "free_y", + labeller = labeller( + Coefficient = c( + "mu_natural" = "Naturalness", + "mu_walking" = "Walking Distance"))) + labs( - # title = "Within-Group Z-Test Results for Mean Differences", - x = "Model Comparisons", - y = "Mean Difference (Naturalness)", + x = "", + y = "Mean Difference", color = "Significance" ) + - theme_minimal() + - theme(legend.position = "right") + theme_bw() + + theme( + strip.text = element_text(face = "bold"), + legend.position = "bottom", + panel.spacing = unit(0.8, "lines") + ) - ggsave("Figures/z_test_split_samples.png", width=8, height=5, dpi="print") + ggsave("Figures/z_test_predictions_between.png", width=9, height=5, dpi="print") - ### **Visualization 2: Predicted vs Not Predicted Comparisons** ### - ggplot(z_test_results_cross, aes(x = Comparison, y = Mean_Difference, color = Significant)) + + z_test_results_pred_vs_nat_wd <- z_test_results_all %>% + filter(Coefficient %in% c("mu_natural", "mu_walking")) %>% filter(Group %in% c("Predicted vs Not Predicted")) %>% mutate(Comparison = case_when( + grepl("Treated Split Sample vs Control Split Sample", Comparison) ~ "Treated vs Control", + grepl("Treated Split Sample vs Opt Split Sample", Comparison) ~ "Treated vs Optional", + grepl("Control Split Sample vs Opt Split Sample", Comparison) ~ "Control vs Optional", + TRUE ~ Comparison # Keep original if not matched + )) + + ggplot(z_test_results_pred_vs_nat_wd, aes(x = factor(Comparison, level=c("Control Pred vs Control Not Pred", "Opt Pred vs Opt Not Pred","Treated Pred vs Treated Not Pred")), y = Mean_Difference, color = Significant)) + geom_point(size = 3) + geom_errorbar(aes(ymin = Lower_CI, ymax = Upper_CI), width = 0.2) + - scale_color_manual(values = c("Significant" = "red", "Not Significant" = "blue")) + - geom_hline(yintercept = 0, linetype = "dashed", color = "black") + # Line at zero - coord_flip() + # Flip for readability + geom_hline(yintercept = 0, linetype = "dashed", color = "gray50") + + scale_color_manual(values = c("Statistically significant difference" = "red", "No statistically significant difference" = "grey")) + + coord_flip() + + facet_grid(Group ~ Coefficient, scales = "free", space = "free_y", + labeller = labeller( + Coefficient = c( + "mu_natural" = "Naturalness", + "mu_walking" = "Walking Distance"))) + labs( - #title = "Predicted vs Not Predicted (Same Group) Z-Test Results", - x = "Model Comparisons", - y = "Mean Difference (Naturalness)", + x = "", + y = "Mean Difference", color = "Significance" ) + - theme_minimal() + - theme(legend.position = "right") + theme_bw() + + theme( + strip.text = element_text(face = "bold"), + legend.position = "bottom", + panel.spacing = unit(0.8, "lines") + ) ggsave("Figures/z_test_predictions.png", width=8, height=5, dpi="print") - \ No newline at end of file -- GitLab