Assuming the data generating process is:
\(Y = X_{sub}\beta+\varepsilon\)
where \(X_{sub}\) is a subset of the possible variables \(X\), using lasso to screen variables and then applying OLS to the selected set gives a biased estimate \(\hat{\beta}\) of the coefficient of interest. (See slide #8.)
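The intuition is the usual omitted-variable-bias argument: the lasso screen is tuned for prediction of \(Y\), so it may drop a control that is only weakly predictive of \(Y\) but strongly correlated with the regressor of interest. In the simplest case, where a single relevant control \(w_j\) with coefficient \(\gamma_j\) is dropped, the post-selection OLS estimate satisfies
\(E[\hat{\beta}] \approx \beta + \gamma_j \frac{Cov(x, w_j)}{Var(x)},\)
so the estimate is biased whenever the dropped control is correlated with \(x\). The simulation below illustrates this.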
beta_X = c()
for (i in 1:100) {
  print(i)
  dgp = generate_data()
  # use lasso (screen.glmnet) to screen variables, then fit OLS (SL.lm) on the selected set
  SL.library <- list(c("SL.lm", "screen.glmnet"))
  sl_screening_lasso <- SuperLearner(Y = dgp$y,
                                     X = data.frame(x = dgp$x, w = dgp$w),
                                     family = gaussian(),
                                     SL.library = SL.library,
                                     cvControl = list(V = 0))
  # why this if ... else ...? The lasso screen may drop x entirely; in that case
  # x has no coefficient in the post-screening OLS fit, so we record 0 instead.
  if (names(sl_screening_lasso$fitLibrary$SL.lm_screen.glmnet$object$coefficients)[2] == "x") {
    beta_X = c(beta_X, sl_screening_lasso$fitLibrary$SL.lm_screen.glmnet$object$coefficients[2])
  } else {
    beta_X = c(beta_X, 0)
  }
}
beta_X_df <- data.frame(beta_X=beta_X)
ggplot(beta_X_df, aes(x = beta_X)) + geom_histogram(binwidth = 0.02)
Assuming the data generating process is:
\(Y = X_{sub}\beta+\varepsilon\)
where \(X_{sub}\) is a subset of the possible variables \(X\), we can use Double Selection to obtain an unbiased estimate \(\hat{\beta}\) of the coefficient of interest. (See slides #10 and #11.)
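Concretely, writing \(S_1\) for the set of controls selected by a lasso of \(y\) on \((x, w)\) and \(S_2\) for the set selected by a lasso of \(x\) on \(w\), post-double-selection runs OLS on
\(y = \beta x + w_{S_1 \cup S_2}\,\delta + \varepsilon\)
and reports the coefficient on \(x\). Selecting controls that predict either \(y\) or \(x\) guards against dropping a confounder that the outcome lasso alone would discard. The loop below implements exactly these three steps.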
beta_X = c()
for (i in 1:100) {
  print(i)
  dgp = generate_data()
  # Step 1: lasso of y on x and w; keep the controls with non-zero coefficients
  SL.library <- lasso$names
  sl_lasso <- SuperLearner(Y = dgp$y,
                           X = data.frame(x = dgp$x, w = dgp$w),
                           family = gaussian(),
                           SL.library = SL.library,
                           cvControl = list(V = 0))
  kept_variables <- which(get_lasso_coeffs(sl_lasso) != 0) - 1 # minus 1 because x is listed first, so the remaining indices refer to columns of w
  kept_variables <- kept_variables[kept_variables > 0]         # drop x itself
  # Step 2: lasso of x on w; keep the controls with non-zero coefficients
  sl_pred_x <- SuperLearner(Y = dgp$x,
                            X = data.frame(w = dgp$w),
                            family = gaussian(),
                            SL.library = lasso$names,
                            cvControl = list(V = 0))
  kept_variables2 <- which(get_lasso_coeffs(sl_pred_x) != 0)   # no shift needed: only w was passed here
  kept_variables2 <- kept_variables2[kept_variables2 > 0]
  # Step 3: OLS of y on x and the union of the controls selected in either step
  sl_screening_lasso <- SuperLearner(Y = dgp$y,
                                     X = data.frame(x = dgp$x, w = dgp$w[, unique(c(kept_variables, kept_variables2))]),
                                     family = gaussian(),
                                     SL.library = "SL.lm",
                                     cvControl = list(V = 0))
  beta_X = c(beta_X, coef(sl_screening_lasso$fitLibrary$SL.lm_All$object)[2])
}
beta_X_df <- data.frame(beta_X=beta_X)
ggplot(beta_X_df, aes(x = beta_X)) + geom_histogram(binwidth = 0.02)
Note, however, that Double Selection works only in the linear case; this is what motivates Double ML below.
We aim to create a function called doubleml which takes as input a vector X, a dataframe W, and a vector Y, as well as values for SL.library.X, SL.library.Y, family.X, and family.Y. Double ML is almost exactly the same as our previous estimator, except that we split the sample in two: the nuisance models are trained on one half and used to form residuals on the other half, and the process is then repeated with the roles of the two halves switched. Moreover, we will add a simple estimate of the standard error. The framework for the function is given as follows:
doubleml <- function(X, W, Y, SL.library.X = "SL.lm", SL.library.Y = "SL.lm", family.X = gaussian(), family.Y = gaussian()) {
  ### STEP 1: split X, W and Y into 2 random sets
  split <- sample(seq_len(length(Y)), size = ceiling(length(Y)/2))
  Y1 = Y[split]
  Y2 = Y[-split]
  X1 = X[split]
  X2 = X[-split]
  W1 = W[split, ]
  W2 = W[-split, ]
  ### STEP 2a: use a SuperLearner to train a model for E[X|W] on set 1 and predict X on set 2 using this model. Do the same but training on set 2 and predicting on set 1
  sl_x1 = SuperLearner(Y = X1,
                       X = data.frame(W1),
                       newX = data.frame(W2),
                       family = family.X,
                       SL.library = SL.library.X,
                       cvControl = list(V = 0))
  x_hat_2 <- sl_x1$SL.predict
  sl_x2 = SuperLearner(Y = X2,
                       X = data.frame(W2),
                       newX = data.frame(W1),
                       family = family.X,
                       SL.library = SL.library.X,
                       cvControl = list(V = 0))
  x_hat_1 <- sl_x2$SL.predict
  ### STEP 2b: get the residuals X - X_hat on set 2 and on set 1
  res_x_2 <- X2 - x_hat_2
  res_x_1 <- X1 - x_hat_1
  ### STEP 3a: use a SuperLearner to train a model for E[Y|W] on set 1 and predict Y on set 2 using this model. Do the same but training on set 2 and predicting on set 1
  sl_y1 = SuperLearner(Y = Y1,
                       X = data.frame(W1),
                       newX = data.frame(W2),
                       family = family.Y,
                       SL.library = SL.library.Y,
                       cvControl = list(V = 0))
  y_hat_2 <- sl_y1$SL.predict
  sl_y2 = SuperLearner(Y = Y2,
                       X = data.frame(W2),
                       newX = data.frame(W1),
                       family = family.Y,
                       SL.library = SL.library.Y,
                       cvControl = list(V = 0))
  y_hat_1 <- sl_y2$SL.predict
  ### STEP 3b: get the residuals Y - Y_hat on set 2 and on set 1
  res_y_2 <- Y2 - y_hat_2
  res_y_1 <- Y1 - y_hat_1
  ### STEP 4: regress (Y - Y_hat) on (X - X_hat) on set 1 and on set 2, and get the coefficient of (X - X_hat) (a no-intercept OLS computed in closed form)
  beta1 = mean(res_x_1*res_y_1)/mean(res_x_1*res_x_1)
  beta2 = mean(res_x_2*res_y_2)/mean(res_x_2*res_x_2)
  ### STEP 5: take the average of these 2 coefficients from the 2 sets (= beta)
  beta = 0.5*(beta1 + beta2)
  ### STEP 6: compute standard errors (done for you). This is just the usual OLS standard error in the regression res_y = res_x*beta + eps.
  psi_stack = c((res_y_1 - res_x_1*beta), (res_y_2 - res_x_2*beta))
  res_stack = c(res_x_1, res_x_2)
  se = sqrt(mean(res_stack^2)^(-2) * mean(res_stack^2*psi_stack^2)) / sqrt(length(Y))
  return(c(beta, se))
}
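In equation form, STEP 4 computes, within each fold, the no-intercept OLS coefficient of the outcome residuals on the treatment residuals, and STEP 6 is the corresponding sandwich standard error computed on the residuals stacked across both folds:
\(\hat{\beta} = \frac{\frac{1}{n}\sum_i \tilde{x}_i\tilde{y}_i}{\frac{1}{n}\sum_i \tilde{x}_i^{2}}, \qquad \widehat{se} = \frac{1}{\sqrt{n}}\sqrt{\Big(\tfrac{1}{n}\sum_i \tilde{x}_i^{2}\Big)^{-2}\tfrac{1}{n}\sum_i \tilde{x}_i^{2}\hat{\psi}_i^{2}}, \qquad \hat{\psi}_i = \tilde{y}_i - \tilde{x}_i\hat{\beta},\)
where \(\tilde{x}_i\) and \(\tilde{y}_i\) are the cross-fitted residuals \(x_i - \hat{E}[x_i \mid w_i]\) and \(y_i - \hat{E}[y_i \mid w_i]\).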
We can now test this function on our dataset:
doubleml(X=dgp$x, W=dgp$w, Y=dgp$y, SL.library.X = "SL.xgboost", SL.library.Y = "SL.xgboost")
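Because the function returns c(beta, se), a normal-approximation 95% confidence interval is easy to construct; a minimal sketch (the object name est is just for illustration):
est <- doubleml(X = dgp$x, W = dgp$w, Y = dgp$y,
                SL.library.X = "SL.xgboost", SL.library.Y = "SL.xgboost")
# point estimate plus/minus 1.96 standard errors
est[1] + c(-1.96, 1.96) * est[2]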
We can do this 100 times to demonstrate the performance of double ML.
beta_X = c()
se_X = c()
for (i in 1:100) {
  tryCatch({
    print(i)
    dgp = generate_data()
    DML = doubleml(X = dgp$x, W = dgp$w, Y = dgp$y,
                   SL.library.X = "SL.xgboost", SL.library.Y = "SL.xgboost")
    beta_X = c(beta_X, DML[1])
    se_X = c(se_X, DML[2])
  }, error = function(e) {
    print(paste("Error for", i))
  })
}
beta_X_df <- data.frame(beta_X=beta_X)
ggplot(beta_X_df, aes(x = beta_X)) + geom_histogram(binwidth = 0.02)
print(mean(se_X))
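As a rough sanity check on these standard errors, the empirical standard deviation of the 100 simulated estimates should be of similar magnitude to the average estimated standard error printed above:
# spread of the simulated point estimates across replications
sd(beta_X_df$beta_X)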
Double ML using xgboost and 4000 simulations