library("Stat2Data")
library("mosaic")
data("ICU")

###### We are trying to select a model to predict
###### Survival (1 = yes, 0 = no) from a set of predictors
###### available at the ICU

##### Setting aside a "test set"
## I'm going to remove a "test set" that I won't look at
## until I've narrowed down to 3 or so models

set.seed(29747)
n <- nrow(ICU)
test.size <- round(0.20 * n)
test.size
## Randomly choose 0.20 * n row numbers
test.rows <- sample(1:nrow(ICU), size = test.size)
## Pick out the rows corresponding to those numbers
ICUTest <- ICU[test.rows, ]
## Use the remaining rows to fit models on
ICUTrain <- ICU[-test.rows, ]

### Stepwise Selection

## Note: Would be better to choose models using domain knowledge,
## but this will give us some "finalists"
full.model <- glm(Survive ~ Age + factor(Sex) + factor(Infection) +
                    SysBP + Pulse, family = "binomial", data = ICUTrain)
null.model <- glm(Survive ~ 1, family = "binomial", data = ICUTrain)

step(null.model, scope = list(upper = full.model))

###### Best-Fitting Models
# (1) ~ Age + SysBP + factor(Infection)
# (2) ~ Age + SysBP
# (3) ~ Age + SysBP + factor(Infection) + factor(Sex)
# (4) ~ Age + SysBP + factor(Infection) + Pulse
# (5) ~ Age + factor(Infection)

formula1 <- Survive ~ Age + SysBP + factor(Infection)
formula2 <- Survive ~ Age + SysBP
formula3 <- Survive ~ Age + SysBP + factor(Infection) + factor(Sex)
formula4 <- Survive ~ Age + SysBP + factor(Infection) + Pulse
formula5 <- Survive ~ Age + factor(Infection)


###### Cross-Validation

### We'll define some different metrics to evaluate the quality
### of the predictions made by a model

### "Hard" prediction error: 
###    1 if fitted probability is on the wrong side of 0.5
###    0 otherwise
### In other words, counts number of "misclassified points"
hard.prediction.error <- function(model, newdata, response)
{
    errors <- 
      predict(model, newdata, type = "response") - newdata[,response] %>% 
        abs()
    return((errors >= 0.5) %>% sum())
}

hard.prediction.error(
  model = full.model,
  newdata = ICUTest,
  response = "Survive") / nrow(ICUTest)

### "Soft" Prediction Error = absolute residual: |y - pi|
###     Rewards confident correct predictions
###     Penalizes confident wrong predictions
soft.prediction.error <- function(model, newdata, response)
{
    errors <- abs(predict(model, newdata, type = "response") - newdata[,response])
    return(errors %>% sum())
}

soft.prediction.error(
  model = full.model,
  newdata = ICUTest,
  response = "Survive") / nrow(ICUTest)

nrow(ICUTrain)
### Define a function that takes:
###   formula: a model formula
###   data: a dataset
###   response: the name of the response variable
###   metric: a function that returns the error for a point
logistic.loocv.error <- function(formula, data, response, metric)
{
    n <- nrow(data)
    ### Initialize a running error total to 0
    total.error <- 0
    for(i in 1:n)
    {
        ## fit the model on the data with case i held out
        model <- glm(formula = formula, family = "binomial", data = data[-i,])
        ### Add the total error from point i to the running total
        total.error <- total.error + metric(model, data[i,], response)
    }
    return(total.error)
}

###### Do cross-validation for a model with five predictors

formulas <- c(formula1, formula2, formula3, formula4, formula5)
loocv.errors <- numeric(5)
names(formulas) <- c("model1", "model2", "model3", "model4", "model5")
names(loocv.errors) <- names(formulas)

### First using "hard" prediction error
for(i in 1:5)
{
    loocv.errors[i] <- logistic.loocv.error(
        formula = formulas[[i]], 
        data = ICUTrain, 
        response = "Survive", 
        metric = hard.prediction.error)
}

loocv.errors

### Now using "soft" prediction error
for(i in 1:5)
{
  loocv.errors[i] <- logistic.loocv.error(
    formula = formulas[[i]], 
    data = ICUTrain, 
    response = "Survive", 
    metric = soft.prediction.error)
}
                
loocv.errors

model1 <- glm(formula1, data = ICUTrain, family = "binomial")
model3 <- glm(formula3, data = ICUTrain, family = "binomial")
model4 <- glm(formula4, data = ICUTrain, family = "binomial")

hard.prediction.error(model1, newdata = ICUTest, response = "Survive")
hard.prediction.error(model3, newdata = ICUTest, response = "Survive")
hard.prediction.error(model4, newdata = ICUTest, response = "Survive")

soft.prediction.error(model1, newdata = ICUTest, response = "Survive")
soft.prediction.error(model3, newdata = ICUTest, response = "Survive")
soft.prediction.error(model4, newdata = ICUTest, response = "Survive")