## code to accompany the reading comprehension example from ## lecture 12 of PSTAT 215A, Bayesian Inference, UCSB ## ## Robert B. Gramacy ## load necessary libraries library(mvtnorm) ## read in the data file and extract sufficient stats o2uptake <- read.table("o2uptake.txt", header=TRUE) ## make a 2-d plot of the data a <- o2uptake$program == "aerobic" plot(o2uptake$age[a], o2uptake$uptake[a], main="", bty="n", xlab="age", ylab="deta-o2-uptake", xlim=range(o2uptake$age), ylim=range(o2uptake$uptake)) points(o2uptake$age[!a], o2uptake$uptake[!a], pch=19) legend("right", c("aerobic", "running"), pch=c(21,19)) ## construct design matrix (X) x1 <- rep(1, nrow(o2uptake)) x2 <- o2uptake$program == "aerobic" x3 <- o2uptake$age x4 <- x2*x3 X <- cbind(x1,x2,x3,x4) p <- ncol(X) ## extract the data y <- o2uptake$uptake n <- length(y) ## maximum likelihood estimator XtXi <- solve(t(X) %*% X) beta.hat <- XtXi %*% t(X) %*% y beta.hat ## maximum likelihood estimator for s2 s2.hat <- mean((y - X %*% beta.hat)^2) ## extract the program-specific lines int.a <- beta.hat[1] slope.a <- beta.hat[3] int.r <- sum(beta.hat[1:2]) slope.r <- sum(beta.hat[3:4]) abline(int.a, slope.a, lwd=2) abline(int.r, slope.r, lty=2, lwd=2) ## we could use the formulas to calculate the sampling ## distributions, etc., sqrt(diag(XtXi)*s2.hat*n/(n-p)) ## but it is easier to do this with commands in R, e.g., fit <- lm(y~X-1) summary(fit) ## MC estimation for the Bayesian model ## number of samples S <- 1000 ## draw from the marginal posterior for s2 s2 <- 1/rgamma(S, (n-p)/2, n*s2.hat/2) ## draw from the conditional posterior for beta|s2 mu.n <- beta.hat beta <- matrix(NA, nrow=S, ncol=p) for(s in 1:S) { beta[s,] <- rmvnorm(1, mu.n, s2[s] * XtXi) } ## make a plot of the beta posterior par(mfrow=c(2,2)) hist(beta[,1], main="") points(beta.hat[1], 0, col=2, pch=19) hist(beta[,2], main="") points(beta.hat[2], 0, col=2, pch=19) hist(beta[,3], main="") points(beta.hat[3], 0, col=2, pch=19) hist(beta[,4], main="") points(beta.hat[4], 0, col=2, pch=19) ## calculate the posterior means and sds of beta apply(beta, 2, mean) apply(beta, 2, sd) ## posterior quantiles apply(beta, 2, quantile, c(0.025, 0.975)) ## posterior distribution of beta_2 + beta_4 x ## for each age x <- min(o2uptake$age):max(o2uptake$age) aerobics.effect <- data.frame(matrix(NA, nrow=S, ncol=length(x))) for(i in 1:length(x)) { aerobics.effect[,i] <- beta[,2] + beta[,4]*x[i] } names(aerobics.effect) <- x ## qboxplot: ## ## a function for plotting a simplified box ## and wisker plot, copied from Peter Hoff qboxplot <- function(x,at=0,width=.5,probs=c(.025,.25,.5,.75,.975)) { qx<-quantile(x,probs=probs) segments(at,qx[1],at,qx[5]) polygon(x=c(at-width,at+width,at+width,at-width), y=c(qx[2],qx[2],qx[4],qx[4]) ,col="gray") segments(at-width,qx[3],at+width,qx[3],lwd=3) segments(at-width/2,qx[1],at+width/2,qx[1],lwd=1) segments(at-width/2,qx[5],at+width/2,qx[5],lwd=1) } ## making the boxplot(s) for beta_2 + beta_4 x plot(range(x),range(y), type="n", xlab="age", bty="n", ylab="beta[2] + beta[4]*x") for(i in 1:length(x) ) qboxplot(aerobics.effect[,i], at=x[i] , width=.25) abline(h=0,col="gray") ## sampling from the posterior predictive distribution ## at new X locations ## create a predictive data set age.star <- seq(min(o2uptake$age)-1, max(o2uptake$age)+1, by=0.1) program.star <- levels(o2uptake$program) star <- expand.grid(age.star, program.star) names(star) <- c("age", "program") ## create a predictive design matrix x1.star <- rep(1, nrow(star)) x2.star <- star$program == "aerobic" x3.star <- star$age x4.star <- x2.star*x3.star X.star <- cbind(x1.star,x2.star,x3.star,x4.star) n.star <- nrow(X.star) ## allocate predictive data q2.stars <- q1.stars <- matrix(NA, nrow=S, ncol=n.star) m.star <- X.star %*% beta.hat V.star <- 1 + diag(X.star %*% XtXi %*% t(X.star)) ## sample quantile-based CI for predictive locations for(s in 1:S) { q1.stars[s,] <- qnorm(0.025, m.star, sqrt(V.star*s2[s]), lower.tail=FALSE) q2.stars[s,] <- qnorm(0.975, m.star, sqrt(V.star*s2[s]), lower.tail=FALSE) } ## calculate mean quantiles q1.star <- apply(q1.stars, 2, mean) q2.star <- apply(q2.stars, 2, mean) ## put Bayesian predictive mean plot up, which is the same as ## the MLE mean plot(o2uptake$age[a], o2uptake$uptake[a], main="", bty="n", xlab="age", ylab="deta-o2-uptake", xlim=range(o2uptake$age), ylim=range(o2uptake$uptake)) points(o2uptake$age[!a], o2uptake$uptake[!a], pch=19) legend("bottomright", c("aerobic", "running"), pch=c(21,19)) abline(int.a, slope.a, lwd=2) abline(int.r, slope.r, lty=2, lwd=2) ## add predictive CIs to the plot a.star <- star$program == "aerobic" lines(star$age[a.star], q1.star[a.star], col=2, lwd=2, lty=2) lines(star$age[a.star], q2.star[a.star], col=2, lwd=2, lty=2) lines(star$age[!a.star], q1.star[!a.star], col=2, lwd=2) lines(star$age[!a.star], q2.star[!a.star], col=2, lwd=2)