rm(list = ls())

library(ggplot2)


dat <- data.frame(matrix(ncol = 2, nrow = 500))
colnames(dat) <- c("Z", "Y")

dat$Z <- c(rep(0, 250), rep(1, 250))
dat$Y[1:250] <- round(rnorm(250, mean = 9, sd = 1.5))
dat$Y[251:500] <- round(rnorm(250, mean = 9, sd = 1.5))


ggplot(data = dat, aes(x = Z, y = Y)) + 
  geom_jitter(width = 0.05) + 
  geom_point(aes(x = c(1), y = c(9)), color = "red", size = 12) + 
  geom_point(aes(x = c(0), y = c(9)), color = "red", size = 12) + 
  scale_x_continuous(breaks = c(0, 1)) + 
  scale_y_continuous(limits = c(0, 15)) + 
  theme_bw()

# Let's estimate a few parameters by hand 
## Difference in means between treatment and control

mean_control <- mean(dat$Y[dat$Z == 0])
mean_treatment <- mean(dat$Y[dat$Z == 1])

mean_diff <- mean_treatment - mean_control

mean_diff

# standard error of difference in means 

se_mean_diff <- sqrt(
  (var(dat$Y[dat$Z == 1]) / 250) + # variance of treatment observations / n_treatment 
    (var(dat$Y[dat$Z == 0]) / 250) # variance of control observations / n_control 
  )

# Let's run simple linear regression and look at the coefficients

fit_dat <- lm(Y ~ Z, 
              data = dat)

summary(fit_dat)

mean_treat <- fit_dat$coefficients[1] + fit_dat$coefficients[2]


# Heteroskedasticity robust standard error

## What is heteroskedasticity? 

set.seed(4358)
dat_het <- data.frame(matrix(ncol = 2, nrow = 500))
colnames(dat_het) <- c("Z", "Y")

dat_het$Z <- c(rep(0, 250), rep(1, 250))
dat_het$Y[1:250] <- round(rnorm(250, mean = 6, sd = 1.5))
dat_het$Y[251:500] <- round(rnorm(250, mean = 9, sd = 7.5))

set.seed(3007)
ggplot(data = dat_het, aes(x = Z, y = Y)) + 
  geom_jitter(width = 0.05) + 
  geom_point(aes(x = c(1), y = c(9)), color = "red", size = 12) + 
  geom_point(aes(x = c(0), y = c(6)), color = "red", size = 12) + 
  scale_x_continuous(breaks = c(0, 1)) + 
  scale_y_continuous(limits = c(-15, 35)) + 
  theme_bw()

## Difference in means between treatment and control

mean_control_het <- mean(dat_het$Y[dat_het$Z == 0])
mean_treatment_het <- mean(dat_het$Y[dat_het$Z == 1])


mean_diff_het <- mean_treatment_het - mean_control_het

mean_diff_het

# standard error of difference in means 

se_mean_diff_het <- sqrt(
  (var(dat_het$Y[dat_het$Z == 1]) / 250) + # variance of treatment observations / n_treatment 
    (var(dat_het$Y[dat_het$Z == 0]) / 250) # variance of control observations / n_control 
)

# linear regression

fit_dat_het <- lm(Y ~ Z,
                  data = dat_het)

summary(fit_dat_het) 

## Violation of the homoskedasticity assumption...
## Heteroskedasticity may introduce bias in standard errors 
## Experimental setting: always use heteroskedasticity robust SEs!
## Observational setting / modeling: look at disagreements between SEs and Robust SEs
### Disagreement may indicate 
### If you want to go deeper into the debates: 
### Read: 
#### 1. King, Gary and Roberts, Margaret E. 2015. How Robust Standard Errors Expose Methodological Problems They Do Not Fix, and What to Do About It. Political Analysis (2015) 23:159-179.
#### 2. Aronow, Peter. 2016. A Note on "How Robust Standard Errors Expose Methodological Problems They Do Not Fix, and What to Do About It". ArXiv


# For robust SEs, use the lm_robust() function from estimatr package

library(estimatr)

lm_robust(Y ~ Z,
          data = dat_het)

fit_dat_het_robust <- tidy(lm_robust(Y ~ Z,
                                     data = dat_het))

fit_dat_het_robust # in this specific case, SEs agree!

class(fit_dat_het_robust)

# Let's calculate SEs by hand and compare output

ci_lower <- mean_diff_het - 1.96 * se_mean_diff_het
ci_upper <- mean_diff_het + 1.96 * se_mean_diff_het