library("Stat2Data") library("mosaic") data("ICU") ###### We are trying to select a model to predict ###### Survival (1 = yes, 0 = no) from a set of predictors ###### available at the ICU ##### Setting aside a "test set" ## I'm going to remove a "test set" that I won't look at ## until I've narrowed down to 3 or so models set.seed(29747) n <- nrow(ICU) test.size <- round(0.20 * n) test.size ## Randomly choose 0.20 * n row numbers test.rows <- sample(1:nrow(ICU), size = test.size) ## Pick out the rows corresponding to those numbers ICUTest <- ICU[test.rows, ] ## Use the remaining rows to fit models on ICUTrain <- ICU[-test.rows, ] ### Stepwise Selection ## Note: Would be better to choose models using domain knowledge, ## but this will give us some "finalists" full.model <- glm(Survive ~ Age + factor(Sex) + factor(Infection) + SysBP + Pulse, family = "binomial", data = ICUTrain) null.model <- glm(Survive ~ 1, family = "binomial", data = ICUTrain) step(null.model, scope = list(upper = full.model)) ###### Best-Fitting Models # (1) ~ Age + SysBP + factor(Infection) # (2) ~ Age + SysBP # (3) ~ Age + SysBP + factor(Infection) + factor(Sex) # (4) ~ Age + SysBP + factor(Infection) + Pulse # (5) ~ Age + factor(Infection) formula1 <- Survive ~ Age + SysBP + factor(Infection) formula2 <- Survive ~ Age + SysBP formula3 <- Survive ~ Age + SysBP + factor(Infection) + factor(Sex) formula4 <- Survive ~ Age + SysBP + factor(Infection) + Pulse formula5 <- Survive ~ Age + factor(Infection) ###### Cross-Validation ### We'll define some different metrics to evaluate the quality ### of the predictions made by a model ### "Hard" prediction error: ### 1 if fitted probability is on the wrong side of 0.5 ### 0 otherwise ### In other words, counts number of "misclassified points" hard.prediction.error <- function(model, newdata, response) { errors <- predict(model, newdata, type = "response") - newdata[,response] %>% abs() return((errors >= 0.5) %>% sum()) } hard.prediction.error( model = full.model, newdata = ICUTest, response = "Survive") / nrow(ICUTest) ### "Soft" Prediction Error = absolute residual: |y - pi| ### Rewards confident correct predictions ### Penalizes confident wrong predictions soft.prediction.error <- function(model, newdata, response) { errors <- abs(predict(model, newdata, type = "response") - newdata[,response]) return(errors %>% sum()) } soft.prediction.error( model = full.model, newdata = ICUTest, response = "Survive") / nrow(ICUTest) nrow(ICUTrain) ### Define a function that takes: ### formula: a model formula ### data: a dataset ### response: the name of the response variable ### metric: a function that returns the error for a point logistic.loocv.error <- function(formula, data, response, metric) { n <- nrow(data) ### Initialize a running error total to 0 total.error <- 0 for(i in 1:n) { ## fit the model on the data with case i held out model <- glm(formula = formula, family = "binomial", data = data[-i,]) ### Add the total error from point i to the running total total.error <- total.error + metric(model, data[i,], response) } return(total.error) } ###### Do cross-validation for a model with five predictors formulas <- c(formula1, formula2, formula3, formula4, formula5) loocv.errors <- numeric(5) names(formulas) <- c("model1", "model2", "model3", "model4", "model5") names(loocv.errors) <- names(formulas) ### First using "hard" prediction error for(i in 1:5) { loocv.errors[i] <- logistic.loocv.error( formula = formulas[[i]], data = ICUTrain, response = "Survive", metric = 
### Now using "soft" prediction error
for (i in 1:5) {
  loocv.errors[i] <- logistic.loocv.error(
    formula = formulas[[i]],
    data = ICUTrain,
    response = "Survive",
    metric = soft.prediction.error)
}
loocv.errors

###### Evaluate the finalists on the held-out test set

model1 <- glm(formula1, data = ICUTrain, family = "binomial")
model3 <- glm(formula3, data = ICUTrain, family = "binomial")
model4 <- glm(formula4, data = ICUTrain, family = "binomial")

hard.prediction.error(model1, newdata = ICUTest, response = "Survive")
hard.prediction.error(model3, newdata = ICUTest, response = "Survive")
hard.prediction.error(model4, newdata = ICUTest, response = "Survive")

soft.prediction.error(model1, newdata = ICUTest, response = "Survive")
soft.prediction.error(model3, newdata = ICUTest, response = "Survive")
soft.prediction.error(model4, newdata = ICUTest, response = "Survive")
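### A short follow-up sketch: a confusion table shows *which* kind of
### misclassification a model makes on the test set, not just how
### many. Treating model1 as the final choice here is an assumption
### for illustration; the 0.5 cutoff matches the "hard" error above.

test.probs <- predict(model1, newdata = ICUTest, type = "response")
test.preds <- as.numeric(test.probs >= 0.5)

## Rows: predicted class; columns: observed Survive
table(Predicted = test.preds, Observed = ICUTest$Survive)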