rm(list = ls()) library(ggplot2) dat <- data.frame(matrix(ncol = 2, nrow = 500)) colnames(dat) <- c("Z", "Y") dat$Z <- c(rep(0, 250), rep(1, 250)) dat$Y[1:250] <- round(rnorm(250, mean = 9, sd = 1.5)) dat$Y[251:500] <- round(rnorm(250, mean = 9, sd = 1.5)) ggplot(data = dat, aes(x = Z, y = Y)) + geom_jitter(width = 0.05) + geom_point(aes(x = c(1), y = c(9)), color = "red", size = 12) + geom_point(aes(x = c(0), y = c(9)), color = "red", size = 12) + scale_x_continuous(breaks = c(0, 1)) + scale_y_continuous(limits = c(0, 15)) + theme_bw() # Let's estimate a few parameters by hand ## Difference in means between treatment and control mean_control <- mean(dat$Y[dat$Z == 0]) mean_treatment <- mean(dat$Y[dat$Z == 1]) mean_diff <- mean_treatment - mean_control mean_diff # standard error of difference in means se_mean_diff <- sqrt( (var(dat$Y[dat$Z == 1]) / 250) + # variance of treatment observations / n_treatment (var(dat$Y[dat$Z == 0]) / 250) # variance of control observations / n_control ) # Let's run simple linear regression and look at the coefficients fit_dat <- lm(Y ~ Z, data = dat) summary(fit_dat) mean_treat <- fit_dat$coefficients[1] + fit_dat$coefficients[2] # Heteroskedasticity robust standard error ## What is heteroskedasticity? set.seed(4358) dat_het <- data.frame(matrix(ncol = 2, nrow = 500)) colnames(dat_het) <- c("Z", "Y") dat_het$Z <- c(rep(0, 250), rep(1, 250)) dat_het$Y[1:250] <- round(rnorm(250, mean = 6, sd = 1.5)) dat_het$Y[251:500] <- round(rnorm(250, mean = 9, sd = 7.5)) set.seed(3007) ggplot(data = dat_het, aes(x = Z, y = Y)) + geom_jitter(width = 0.05) + geom_point(aes(x = c(1), y = c(9)), color = "red", size = 12) + geom_point(aes(x = c(0), y = c(6)), color = "red", size = 12) + scale_x_continuous(breaks = c(0, 1)) + scale_y_continuous(limits = c(-15, 35)) + theme_bw() ## Difference in means between treatment and control mean_control_het <- mean(dat_het$Y[dat_het$Z == 0]) mean_treatment_het <- mean(dat_het$Y[dat_het$Z == 1]) mean_diff_het <- mean_treatment_het - mean_control_het mean_diff_het # standard error of difference in means se_mean_diff_het <- sqrt( (var(dat_het$Y[dat_het$Z == 1]) / 250) + # variance of treatment observations / n_treatment (var(dat_het$Y[dat_het$Z == 0]) / 250) # variance of control observations / n_control ) # linear regression fit_dat_het <- lm(Y ~ Z, data = dat_het) summary(fit_dat_het) ## Violation of the homoskedasticity assumption... ## Heteroskedasticity may introduce bias in standard errors ## Experimental setting: always use heteroskedasticity robust SEs! ## Observational setting / modeling: look at disagreements between SEs and Robust SEs ### Disagreement may indicate ### If you want to go deeper into the debates: ### Read: #### 1. King, Gary and Roberts, Margaret E. 2015. How Robust Standard Errors Expose Methodological Problems They Do Not Fix, and What to Do About It. Political Analysis (2015) 23:159-179. #### 2. Aronow, Peter. 2016. A Note on "How Robust Standard Errors Expose Methodological Problems They Do Not Fix, and What to Do About It". ArXiv # For robust SEs, use the lm_robust() function from estimatr package library(estimatr) lm_robust(Y ~ Z, data = dat_het) fit_dat_het_robust <- tidy(lm_robust(Y ~ Z, data = dat_het)) fit_dat_het_robust # in this specific case, SEs agree! class(fit_dat_het_robust) # Let's calculate SEs by hand and compare output ci_lower <- mean_diff_het - 1.96 * se_mean_diff_het ci_upper <- mean_diff_het + 1.96 * se_mean_diff_het