library(data.table)
data <- fread("synthetic_data.csv")
data$C1 <- factor(data$C1)
data$C2 <- factor(data$C2)
data$C3 <- factor(data$C3)
data$XC <- factor(data$XC)
y <- data$Y
t <- data$Z
x <- model.matrix(~ ., data[, -c(1:3)])[, -1]
x <- scale(x, scale = FALSE) # center the regressors
Look for the significant interactions.
naive.model <- lm(y ~ t * x)
## summary(naive.model)
df.naive <- cbind(summary(naive.model)$coef[, c(1, 4)], confint(naive.model))[29:54, ]
First step: marginal regressions on X.
library(grf)
set.seed("20180511")
mu.y <- regression_forest(x, y, tune.parameters = TRUE)
mu.t <- regression_forest(x, t, tune.parameters = TRUE)
y.tilde <- y - predict(mu.y)$predictions
t.tilde <- t - predict(mu.t, x)$predictions
Overall treatment effect
summary(lm(y.tilde ~ t.tilde))
##
## Call:
## lm(formula = y.tilde ~ t.tilde)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.52019 -0.35967 0.00218 0.32686 1.61176
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0005944 0.0050860 0.117 0.907
## t.tilde 0.2559451 0.0108868 23.510 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5184 on 10389 degrees of freedom
## Multiple R-squared: 0.05051, Adjusted R-squared: 0.05042
## F-statistic: 552.7 on 1 and 10389 DF, p-value: < 2.2e-16
Second step: transform the response/predictors.
# x.tilde <- model.matrix(~ .^2, data.frame(x))[, -1] ## no intercept, all first order interactions
x.tilde <- x
x.tilde <- x.tilde[, apply(x.tilde, 2, sd) > 0]
x.tilde <- x.tilde[, !duplicated(t(x.tilde))]
x.tilde <- scale(x.tilde, scale = FALSE) * as.vector(t.tilde) ## make the mean of x.tilde zero
y.tilde <- lm(y.tilde ~ t.tilde)$residuals
full.model <- lm(y.tilde ~ x.tilde - 1)
## summary(full.model)
df.full <- cbind(summary(full.model)$coef[, c(1, 4)], confint(full.model))
df.univariate <- matrix(0, ncol(x.tilde), 4)
for (j in 1:ncol(x.tilde)) {
df.univariate[j, 1] <- coef(lm(y.tilde ~ x.tilde[, j] - 1))
df.univariate[j, 3:4] <- confint(lm(y.tilde ~ x.tilde[, j] - 1))
df.univariate[j, 2] <- summary(lm(y.tilde ~ x.tilde[, j] - 1))$coef[1, 4]
}
rownames(df.univariate) <- colnames(x.tilde)
colnames(df.univariate) <- c("estimate", "p.value", "CI.low", "CI.up")
## df.univariate
Lasso
## first scale x.tilde, as recommended in high-dimensional regression
library(selectiveInference)
## Loading required package: glmnet
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-16
## Loading required package: intervals
##
## Attaching package: 'intervals'
## The following object is masked from 'package:Matrix':
##
## expand
## Loading required package: survival
x.tilde.scale <- apply(x.tilde, 2, sd)
x.tilde <- scale(x.tilde, center = FALSE, scale = x.tilde.scale)
gfit <- glmnet(x.tilde, y.tilde, standardize = FALSE, intercept = FALSE)
eps <- matrix(rnorm(nrow(x.tilde) * 1000, sd = sd(lm(y.tilde ~ x.tilde)$residuals)), ncol = 1000)
lambda <- 1.1 * mean(apply(abs(t(x.tilde) %*% eps), 2, max))
beta <- coef(gfit, s = lambda/nrow(x.tilde), exact=TRUE, x = x.tilde, y = y.tilde)[-1]
out <- fixedLassoInf(x.tilde, y.tilde, beta, lambda, intercept = FALSE, alpha = 0.05)
out$ci <- t(scale(t(out$ci), FALSE, x.tilde.scale[out$vars]))
out$coef0 <- out$coef0 / x.tilde.scale[out$vars]
out.lasso <- out
out.pretty <- round(data.frame(cbind(out$vars, out$coef0, out$ci, out$pv)), 4)
out.pretty <- cbind(out.pretty, symnum(out$pv, corr = FALSE, na = FALSE, cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***", "**", "*", ".", " ")))
colnames(out.pretty) <- c("var", "estimate", "ci.low", "ci.up", "p.value", "")
out.pretty$var <- colnames(x.tilde)[out.pretty$var]
## out.pretty
df.lasso <- out.pretty[, c(1, 2, 5, 3, 4)]
Inference ignoring model selection
x.tilde <- scale(x.tilde, scale = 1/x.tilde.scale)
snooping.model <- lm(y.tilde ~ x.tilde[, out.lasso$vars] - 1)
df.lasso.snooping <- data.frame(colnames(x.tilde)[out.lasso$vars], summary(snooping.model)$coef[, c(1, 4)], confint(snooping.model))
df.naive <- data.frame(colnames(x.tilde), df.naive)
colnames(df.naive) <- colnames(df.lasso)
df.full <- data.frame(colnames(x.tilde), df.full)
colnames(df.full) <- colnames(df.lasso)
df.univariate <- data.frame(colnames(x.tilde), df.univariate)
colnames(df.univariate) <- colnames(df.lasso)
colnames(df.lasso.snooping) <- colnames(df.lasso)
df.naive$method <- "Naive"
df.full$method <- "Full"
df.univariate$method <- "Marginal"
df.lasso$method <- "Lasso"
df.lasso.snooping$method <- "Snooping"
df <- rbind(df.naive, df.full, df.univariate, df.lasso, df.lasso.snooping)
rownames(df) <- NULL
df$method <- factor(df$method, c("Naive", "Marginal", "Full", "Lasso", "Lasso (Snooping)"))
library(ggplot2)
library(scales)
ggplot(df) + aes(x = var, y = estimate, ymin = ci.low, ymax = ci.up, size = pmax(1.96, - qnorm(p.value / 2)), color = (p.value > 0.05)) + geom_point() + geom_errorbar() + facet_grid(method ~ .) + scale_size_continuous(range = c(1, 2), guide = FALSE) + scale_color_discrete(guide = FALSE) + geom_hline(yintercept = 0, linetype = "dotted") + theme_bw(base_size = 15) + xlab("") + ylab("Effect modification") + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))