Commit 004a1e65 authored by nc71qaxa

ols matching

parent 74712a44
Showing 449 additions and 384 deletions
......@@ -109,78 +109,78 @@ testData <- labeled_data[-trainIndex, ]
tuneGrid <- expand.grid(
nrounds = c(300),
max_depth = c(15),
eta = c(0.05),
gamma = c(0, 0.1),
colsample_bytree = c(0.6, 0.8), # Fraction of features to be used for each tree
min_child_weight = c(1, 3, 5), # Minimum sum of instance weight (hessian) needed in a child
subsample = c(0.7, 0.8) # Fraction of samples to be used for each tree
)
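# The grid above spans 1 x 1 x 1 x 2 x 2 x 3 x 2 = 24 hyperparameter combinations,
# each evaluated by the 5-fold cross-validation configured below.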
model3 <- train(Choice_Treat ~ .,
data = trainData,
method = "xgbTree",
tuneGrid = tuneGrid,
trControl = trainControl(method = "cv", number = 5))
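# caret::train() fits one model per grid row and fold; for a factor outcome it keeps
# the combination with the highest mean cross-validated accuracy by default.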
# Get variable importance
varImp(model3)
# Evaluate the model
predictions <- predict(model3, newdata = testData)
confusionMatrix(predictions, testData$Choice_Treat)
labeled_predictions <- predict(model3, newdata = labeled_data)
labeled_data$PredictedGroup <- labeled_predictions
table(labeled_data$Choice_Treat, labeled_data$PredictedGroup)
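# Rows are the observed Choice_Treat levels, columns the predictions (table() takes
# the row variable first).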
unlabeled_predictions <- predict(model3, newdata = unlabeled_data)
labeled_data_id$PredictedGroup <- labeled_predictions
data_prediction_labeled <- select(labeled_data_id, c("id", "PredictedGroup"))
saveRDS(data_prediction_labeled, "Data/predictions_labeled.RDS")
unlabeled_data$PredictedGroup <- unlabeled_predictions
data_prediction <- select(unlabeled_data, c("id", "PredictedGroup"))
saveRDS(data_prediction, "Data/predictions.RDS")
print(model3$bestTune)
table(data$predicted.classes, data$Choice_Treat)
calculate_metrics <- function(confusion_matrix) {
# Extract values from the confusion matrix
TN <- confusion_matrix[1, 1]
FP <- confusion_matrix[2, 1]
FN <- confusion_matrix[1, 2]
TP <- confusion_matrix[2, 2]
# Calculate sensitivity
sensitivity <- round(TP / (TP + FN), 2)
# Calculate specificity
specificity <- round(TN / (TN + FP), 2)
# Return the results as a list
return(list(sensitivity = sensitivity, specificity = specificity))
}
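# The indexing above assumes a 2x2 table(predicted, observed): rows are predicted
# classes and columns observed classes, so [2, 1] counts false positives.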
# Calculate metrics
metrics <- calculate_metrics(table(data$predicted.classes, data$Choice_Treat))
# Print results
print(paste("Sensitivity:", metrics$sensitivity))
print(paste("Specificity:", metrics$specificity))
roc_obj <- roc(data$Choice_Treat, data$probabilities)
plot(roc_obj, main = "ROC Curve", col = "blue", lwd = 2)
auc_value <- auc(roc_obj)
best_coords <- coords(roc_obj, "best", best.method="youden")
cut_off <- best_coords$threshold
\ No newline at end of file
# tuneGrid <- expand.grid(
# nrounds = c(300),
# max_depth = c(15),
# eta = c(0.05),
# gamma = c(0, 0.1),
# colsample_bytree = c(0.6, 0.8), # Fraction of features to be used for each tree
# min_child_weight = c(1, 3, 5), # Minimum sum of instance weight (hessian) needed in a child
# subsample = c(0.7, 0.8) # Fraction of samples to be used for each tree
# )
#
#
# model3 <- train(Choice_Treat ~ .,
# data = trainData,
# method = "xgbTree",
# tuneGrid = tuneGrid,
# trControl = trainControl(method = "cv", number = 5))
#
#
# # Get variable importance
# varImp(model3)
#
#
#
# # Evaluate the model
# predictions <- predict(model3, newdata = testData)
# confusionMatrix(predictions, testData$Choice_Treat)
# labeled_predictions <- predict(model3, newdata = labeled_data)
# labeled_data$PredictedGroup <- labeled_predictions
#
# table(labeled_data$Choice_Treat, labeled_data$PredictedGroup)
#
# unlabeled_predictions <- predict(model3, newdata = unlabeled_data)
# labeled_data_id$PredictedGroup <- labeled_predictions
# data_prediction_labeled<-select(labeled_data_id, c("id", "PredictedGroup"))
# saveRDS(data_prediction_labeled, "Data/predictions_labeled.RDS")
#
# unlabeled_data$PredictedGroup <- unlabeled_predictions
# data_prediction<-select(unlabeled_data, c("id", "PredictedGroup"))
# saveRDS(data_prediction, "Data/predictions.RDS")
#
# print(model3$bestTune)
#
#
# table(data$predicted.classes ,data$Choice_Treat)
#
# calculate_metrics <- function(confusion_matrix) {
# # Extract values from the confusion matrix
# TN <- confusion_matrix[1, 1]
# FP <- confusion_matrix[2, 1]
# FN <- confusion_matrix[1, 2]
# TP <- confusion_matrix[2, 2]
#
# # Calculate sensitivity
# sensitivity <- round(TP / (TP + FN), 2)
#
# # Calculate specificity
# specificity <- round(TN / (TN + FP), 2)
#
# # Return the results as a list
# return(list(sensitivity = sensitivity, specificity = specificity))
# }
#
# # Calculate metrics
# metrics <- calculate_metrics(table(data$predicted.classes ,data$Choice_Treat))
#
# # Print results
# print(paste("Sensitivity:", metrics$sensitivity))
# print(paste("Specificity:", metrics$specificity))
# roc_obj <- roc(data$Choice_Treat, data$probabilities)
# plot(roc_obj, main = "ROC Curve", col = "blue", lwd = 2)
# auc_value <- auc(roc_obj)
# best_coords <- coords(roc_obj, "best", best.method="youden")
#
#
# cut_off <- best_coords$threshold
\ No newline at end of file
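# A minimal sketch (not in the original script) of how the Youden-optimal threshold
# computed above could replace the default 0.5 cutoff; assumes data$probabilities
# holds the predicted probability of the positive class:
# data$predicted.classes <- ifelse(data$probabilities >= cut_off, 1, 0)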
......@@ -16,7 +16,7 @@ database_full <- database_full %>%
data <- database_full %>%
group_by(id) %>%
-  slice(1) %>%
+  dplyr::slice(1) %>%
ungroup()
data <- data %>%
......
......@@ -142,7 +142,7 @@ apollo_probabilities=function(apollo_beta, apollo_inputs, functionality="estimat
model = apollo_estimate(apollo_beta, apollo_fixed,
apollo_probabilities, apollo_inputs,
estimate_settings=list(maxIterations=400,
estimationRoutine="bfgs",
estimationRoutine="bgw",
hessianRoutine="analytic"))
......
......@@ -142,7 +142,7 @@ apollo_probabilities=function(apollo_beta, apollo_inputs, functionality="estimat
model = apollo_estimate(apollo_beta, apollo_fixed,
apollo_probabilities, apollo_inputs,
estimate_settings=list(maxIterations=400,
estimationRoutine="bfgs",
estimationRoutine="bgw",
hessianRoutine="analytic"))
......
......@@ -142,7 +142,7 @@ apollo_probabilities=function(apollo_beta, apollo_inputs, functionality="estimat
model = apollo_estimate(apollo_beta, apollo_fixed,
apollo_probabilities, apollo_inputs,
estimate_settings=list(maxIterations=400,
estimationRoutine="bfgs",
estimationRoutine="bgw",
hessianRoutine="analytic"))
......
......@@ -142,7 +142,7 @@ apollo_probabilities=function(apollo_beta, apollo_inputs, functionality="estimat
model = apollo_estimate(apollo_beta, apollo_fixed,
apollo_probabilities, apollo_inputs,
estimate_settings=list(maxIterations=400,
estimationRoutine="bfgs",
estimationRoutine="bgw",
hessianRoutine="analytic"))
......
......@@ -142,7 +142,7 @@ apollo_probabilities=function(apollo_beta, apollo_inputs, functionality="estimat
model = apollo_estimate(apollo_beta, apollo_fixed,
apollo_probabilities, apollo_inputs,
estimate_settings=list(maxIterations=400,
estimationRoutine="bfgs",
estimationRoutine="bgw",
hessianRoutine="analytic"))
......
......@@ -33,3 +33,7 @@ summary(conseq_model_D)
conseq_model_control_D <- lm(Conseq_score ~ as.factor(Treatment_D) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
summary(conseq_model_control_D)
conseq_model_control_G <- lm(Conseq_score ~ as.factor(Matching_Group) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
summary(conseq_model_control_G)
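# R orders factor levels alphabetically, so "Control Not Pred" is the omitted
# reference category of Matching_Group in the model above.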
data <- database_full %>%
group_by(id) %>%
-  slice(1) %>%
+  dplyr::slice(1) %>%
ungroup()
data$Treatment_C <- as.factor(data$Treatment_C)
......
matching_group <- data %>% dplyr::select(id, Matching_Group)
quiz_data <- left_join(quiz_data, matching_group, by="id")
quiz_data$Treatment_C <- as.factor(quiz_data$Treatment_C)
quiz_data$Treatment_D <- as.factor(quiz_data$Treatment_D)
......@@ -27,6 +31,9 @@ ols_percentage_correct_D<- lm( percentage_correct ~ as.factor(Treatment_D) ,quiz
summary(ols_percentage_correct_D)
ols_percentage_correct_control_D <- lm(percentage_correct ~ as.factor(Treatment_D) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, quiz_data)
summary(ols_percentage_correct_control_D)
ols_percentage_correct_control_G <- lm(percentage_correct ~ as.factor(Matching_Group) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, quiz_data)
summary(ols_percentage_correct_control_G)
#
# # Create an HTML results table with customized names and stars
# results_table_3 <- stargazer(
......
library(car)
-data <- database_full %>%
data_predictions1 <- readRDS("Data/predictions.RDS")
data_predictions2 <- readRDS("Data/predictions_labeled.RDS")
data_predictions <- bind_rows(data_predictions1, data_predictions2)
database <- left_join(database_full, data_predictions, by="id")
database <- database %>%
filter(!is.na(Treatment_new)) %>%
mutate(Dummy_Treated = case_when(Treatment_new == 1 | Treatment_new == 2 ~ 1, TRUE ~ 0),
       Dummy_Vol_Treated = case_when(Treatment_new == 5 | Treatment_new == 4 ~ 1, TRUE ~ 0),
       Dummy_no_info = case_when(Treatment_new == 3 ~ 1, TRUE ~ 0)) %>%
mutate(Matching_Group = case_when(Dummy_Treated == 1 & PredictedGroup == 1 ~"Treat Pred",
Dummy_Treated == 1 & PredictedGroup == 0 ~"Treat Not Pred",
Treatment_A == "Vol_Treated" & PredictedGroup == 1 ~ "Opt Pred",
Treatment_A == "Vol_Treated" & PredictedGroup == 0 ~ "Opt Not Pred",
Treatment_new == 6 & PredictedGroup == 0 ~ "Control Pred",
TRUE ~ "Control Not Pred"))
data <- database %>%
group_by(id) %>%
dplyr::slice(1) %>%
ungroup()
......@@ -28,6 +50,11 @@ summary(ols_time_spent_D)
ols_time_spent_control_D <- lm(interviewtime_net_clean ~ as.factor(Treatment_D) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
summary(ols_time_spent_control_D)
ols_time_spent_control_G <- lm(interviewtime_net_clean ~ as.factor(Matching_Group) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
summary(ols_time_spent_control_G)
ols_time_cc_A <- lm(CC_time_mean_clean ~ as.factor(Treatment_A), data)
summary(ols_time_cc_A)
ols_time_cc_control_A <- lm(CC_time_mean_clean ~ as.factor(Treatment_A) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
......@@ -45,6 +72,10 @@ ols_time_cc_D<- lm( CC_time_mean_clean ~ as.factor(Treatment_D) ,data)
summary(ols_time_cc_D)
ols_time_cc_control_D <- lm(CC_time_mean_clean ~ as.factor(Treatment_D) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
summary(ols_time_cc_control_D)
ols_time_cc_control_G <- lm(CC_time_mean_clean ~ as.factor(Matching_Group) + Z_Mean_NR + as.factor(Gender) + Age_mean + QFIncome + Uni_degree, data)
summary(ols_time_cc_control_G)
# # Create an HTML results table with customized names and stars
# results_table_5 <- stargazer(
# ols_tme_spent_1, ols_tme_spent_control_1,
......
# Load models
-MXL_wtp_Control_Not_Pred_model <- readRDS("C:/nextcloud/Hot_Topic_Cool_Choices/Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Not_Pred_model.rds")
-MXL_wtp_Control_Pred_model <- readRDS("C:/nextcloud/Hot_Topic_Cool_Choices/Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Pred_model.rds")
-MXL_wtp_Opt_Not_Pred_model <- readRDS("C:/nextcloud/Hot_Topic_Cool_Choices/Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Not_Pred_model.rds")
-MXL_wtp_Opt_Pred_model <- readRDS("C:/nextcloud/Hot_Topic_Cool_Choices/Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Pred_model.rds")
-MXL_wtp_Treated_Not_Pred_model <- readRDS("C:/nextcloud/Hot_Topic_Cool_Choices/Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Not_Pred_model.rds")
-MXL_wtp_Treated_Pred_model <- readRDS("C:/nextcloud/Hot_Topic_Cool_Choices/Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Pred_model.rds")
+MXL_wtp_Control_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Not_Pred")
+MXL_wtp_Control_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Control_Pred")
+MXL_wtp_Opt_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Not_Pred")
+MXL_wtp_Opt_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Opt_Pred")
+MXL_wtp_Treated_Not_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Not_Pred")
+MXL_wtp_Treated_Pred_model <- apollo_loadModel("Estimation_results/mxl/prediction/Split_samples/MXL_wtp_Treated_Pred")
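# apollo_loadModel() restores a model saved with apollo_saveOutput() from its modelName
# prefix, replacing the hard-coded absolute readRDS() paths above with relative ones.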
# List of models
models <- list(
......@@ -17,39 +17,62 @@ models <- list(
)
# Define the x-range for plotting (1000 values)
-x_range <- seq(-10, 100, length.out = 1000)
+x_range <- seq(-20, 100, length.out = 1000)
# Define colors and line types for each model
colors <- c("blue", "red", "green", "purple", "orange", "brown")
line_types <- 1:6
+# Define an empty data frame to store the results
+plot_data <- data.frame()
-# Initialize the plot with the first model
-model <- models[["Treated Not Pred"]]
-coef <- summary(model)$estimate
-mean <- coef["mu_natural", "Estimate"]
-sd <- coef["sig_natural", "Estimate"]
-y <- dnorm(x_range, mean, sd)
-plot(x_range, y, type = "l", col = colors[1], lwd = 2, lty = line_types[1],
-     ylim = c(0, max(y)), xlab = "x", ylab = "Density",
-     main = "Normal Distributions from Multiple MXL Models")
-# Loop through the remaining models and add their distributions to the plot
-for (i in 2:length(models)) {
+# Loop through the models and calculate the density for each
+for (i in seq_along(models)) {
+  model_name <- names(models)[i]
model <- models[[i]]
# Extract mean and standard deviation for 'mu_natural' and 'sig_natural'
-  coef <- summary(model)$estimate
-  mean <- coef["mu_natural", "Estimate"]
-  sd <- coef["sig_natural", "Estimate"]
+  coef <- model$estimate
+  mean <- coef["mu_natural"]
+  sd <- coef["sig_natural"]
-  # Compute the normal distribution based on extracted mean and sd
+  # Compute the normal distribution based on the extracted mean and sd
y <- dnorm(x_range, mean, sd)
-  # Add the distribution to the existing plot
-  lines(x_range, y, col = colors[i], lwd = 2, lty = line_types[i])
+  # Create a data frame for this model and append it to the main data frame
+  temp_data <- data.frame(x = x_range, density = y, model = model_name)
+  plot_data <- bind_rows(plot_data, temp_data)
}
+# Plot using ggplot2
+ggplot(plot_data, aes(x = x, y = density, color = model)) +
+  geom_line() +
+  labs(
+    title = "Normal Distributions from Multiple MXL Models",
+    x = "x",
+    y = "Density",
+    color = "Model"
+  ) +
+  theme_minimal() +
+  theme(legend.position = "right") +
+  scale_color_brewer(palette = "Set1")
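# Optional export of the ggplot figure (a usage sketch; path and size are hypothetical):
# ggsave("Figures/mxl_mu_natural_densities.png", width = 8, height = 5)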
#### Z-test if distributions are different
-z_value <- (MXL_wtp_Treated_Pred_model$estimate["mu_natural"] - MXL_wtp_Control_Pred_model$estimate["mu_natural"])/
-  (sqrt((MXL_wtp_Treated_Pred_model$estimate["sig_natural"]/sqrt(MXL_wtp_Treated_Pred_model$nObs)) +
-          (MXL_wtp_Control_Pred_model$estimate["sig_natural"]/sqrt(MXL_wtp_Control_Pred_model$nObs))))
+# Function to calculate the z-value for the difference in mu_natural between two models
+calculate_z_value <- function(model_1, model_2) {
+  z_value <- (model_1$estimate["mu_natural"] - model_2$estimate["mu_natural"]) /
+    (sqrt((model_1$estimate["sig_natural"] / sqrt(model_1$nObs / 10)) +
+            (model_2$estimate["sig_natural"] / sqrt(model_2$nObs / 10))))
+  return(z_value)
+}
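# In words: z = (mu_1 - mu_2) / sqrt(sig_1 / sqrt(n_1 / 10) + sig_2 / sqrt(n_2 / 10)),
# where dividing nObs by 10 presumably converts choice observations into respondents
# (assumption: 10 choice tasks per person).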
-# Add a legend
-legend("topright", inset = c(-0.25, 0), legend = names(models), col = colors,
-       lty = line_types, lwd = 2, xpd = TRUE)
+# Example of using the function
+z_value <- calculate_z_value(MXL_wtp_Treated_Pred_model, MXL_wtp_Control_Pred_model)
+# Print the z-value
+z_value
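# Two-sided p-value under a standard-normal reference (a usage sketch, not part of the
# original commit):
p_value <- 2 * pnorm(-abs(z_value))
p_value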