# Base URL of the directory holding the data sets used in this script
file_path <- "http://www.statslab.cam.ac.uk/~rds37/teaching/statistical_modelling/"
Movies <- read.csv(paste0(file_path, "Movies.csv"))

# Fit the log-gross model. `data = Movies` is needed so that the formula
# variables are looked up as columns of the data frame; without it lm() searches
# the calling environment and fails unless Movies happens to be attached.
MoviesLM <- lm(
  log(Total.Gross) ~ log(Opening) + Screens + RT + log(Budget),
  data = Movies
)
lev <- hatvalues(MoviesLM)
# Rule of thumb: flag observations whose leverage exceeds 3 * p / n, where p is
# the number of fitted coefficients (intercept included). p is taken from the
# model itself so the cut-off stays correct if the formula changes.
high_lev <- which(lev > 3 * length(coef(MoviesLM)) / nrow(Movies)) # high leverage obs
lev[high_lev] # actually shows their leverage
# Should see that obs 99 has much higher leverage than the rest
Movies[99, ]
# Looking at the covariate values, we see that this is largely because
# the budget was so low for the film
# Now a linear model without this obs
# NOTE(review): this formula also leaves out Screens, unlike MoviesLM above --
# confirm that dropping the covariate as well as obs 99 is intended.
MoviesLM_sub <- lm(
  log(Total.Gross) ~ log(Opening) + RT + log(Budget),
  data = Movies,
  subset = -99
)
## Prediction intervals
# Read the second data set and predict from the model fitted without obs 99
Movies2010 <- read.csv(paste0(file_path, "Movies2010.csv"))
pred_intervals <- predict(
  MoviesLM_sub,
  newdata = Movies2010,
  interval = "prediction"
)
# The response was modelled on the log scale, so exponentiate the fitted
# values and interval limits to get back to the scale of Total.Gross
pred_intervals_trans <- exp(pred_intervals)
target <- Movies2010$Total.Gross
# Each comparison below gives a logical vector, and & combines the two
# componentwise. mean() then coerces the logical vector into a numeric vector
# where TRUE -> 1 and FALSE -> 0, so the result is the proportion of films
# whose true gross lies strictly between the lower and upper limits.
mean(
  target > pred_intervals_trans[, "lwr"] &
    target < pred_intervals_trans[, "upr"]
)
# We can also see how well our model predicts film earnings by
# displaying the true earnings alongside the predicted earnings
cbind(target, pred_intervals_trans)