library(data.table)
path <- "http://www.statslab.cam.ac.uk/~rds37/teaching/statistical_modelling/"
Smoking <- fread(file.path(path, "Smoking.csv"))
class(Smoking)
Smoking[, Total := Survived + Died]
Smoking[, Prop.died := Died / Total]
fit1 <- glm(Prop.died ~ Age.group + Smoker, data = Smoking, family = binomial, weights = Total)
summary(fit1)
par(mfrow = c(2, 2))
plot(fit1)
fit2 <- glm(Prop.died ~ Age.group + I(Age.group^2) + Smoker, data = Smoking, family = binomial, weights = Total)
plot(fit2)
newdata <- data.frame(Age.group = rep(21:80, 2),
Smoker = rep(c("Yes", "No"), each = 60))
newdata$Prop.died <- predict(fit2, newdata, type = "response")
newdata$Prop.died.se <- predict(fit2, newdata, type = "response", se.fit = TRUE)$se.fit
library(ggplot2)
qplot(Age.group, Prop.died, data = newdata, colour = Smoker, geom = "line") +
geom_errorbar(aes(ymin = Prop.died - 1.96 * Prop.died.se, ymax = Prop.died + 1.96 * Prop.died.se))
qplot(Age.group, Prop.died, data = newdata, colour = Smoker, geom = "line") +
geom_ribbon(aes(ymin = Prop.died - 1.96 * Prop.died.se, ymax = Prop.died + 1.96 * Prop.died.se, fill = Smoker), linetype = "dotted", alpha = 0.5)
## Poisson GLM
football2014 <- read.csv(file.path(path, "football2014.csv"), stringsAsFactors = TRUE)
head(football2014)
football2014[269, ]
fit <- glm(GoalsScored ~ By + Against + HomeAway, data = football2014, family = poisson)
summary(fit)
coef(fit)[1:5]
football2014$By <- relevel(football2014$By, "Man United")
football2014$Against <- relevel(football2014$Against, "Man City")
fit2 <- glm(GoalsScored ~ By + Against + HomeAway, data = football2014, family = poisson)
coef(fit2)[1:5]
get.matches <- function(football) {
## Check format
first.half <- 1:(nrow(football)/2)
stopifnot(sum(football$By[first.half] != football$Against[- first.half]) == 0)
stopifnot(sum(football$Against[first.half] != football$By[- first.half]) == 0)
stopifnot(sum(football$HomeAway[first.half] != "Home") == 0)
data.table(Home = football$By[first.half],
Away = football$Against[first.half],
Goals.Home = football$GoalsScored[first.half],
Goals.Away = football$GoalsScored[- first.half])
}
get.matches(football2014)
combine.standing <- function(standing1, standing2) {
standing <- rbind(standing1, standing2)[, list(points = sum(points), goals = sum(goals)), by = "Team"]
standing <- standing[order(-points, - goals)]
standing$rank <- 1:nrow(standing)
standing
}
get.standing <- function(matches) {
standing.home <- matches[,
list(points = 3 * sum(Goals.Home > Goals.Away) + 1 * sum(Goals.Home == Goals.Away),
goals = sum(Goals.Home)),
by = "Home"]
standing.away <- matches[,
list(points = 3 * sum(Goals.Away > Goals.Home) + 1 * sum(Goals.Home == Goals.Away),
goals = sum(Goals.Away)),
by = "Away"]
names(standing.home)[1] <- "Team"
names(standing.away)[1] <- "Team"
combine.standing(standing.home, standing.away)
}
standing.current <- get.standing(get.matches(football2014))
## Simulate the rest
remaining <- read.csv(file.path(path, "fixtures_remaining.csv"))
remaining$Goals.predict <- predict(fit, newdata = remaining, type = "response")
standing.simulation <- function() {
remaining$GoalsScored <- rpois(nrow(remaining), remaining$Goals.predict)
combine.standing(standing.current, get.standing(get.matches(remaining)))
}
standing.simulated <- replicate(1000, standing.simulation(), simplify = FALSE)
head(standing.simulated)
standing.simulated <- rbindlist(standing.simulated)
standing.simulated[,
list(rank.mean = mean(rank),
rank.q5 = quantile(rank, .05),
rank.q95 = quantile(rank, .95),
points.mean = mean(points),
goals.means = mean(goals)
),
by = "Team"][order(rank.mean), ]