library("Stat2Data")
library("mosaic")
data("Leukemia")

###### We are trying to select a model to predict
###### Resp: will a Leukemia patient respond to treatment? (1 = yes, 0 = no) 
###### from a set of diagnostic and vitals results

##### Setting aside a "test set"

set.seed(29747) # Don't forget to do this!
n <- nrow(Leukemia)
test.size <- round(0.20 * n)
test.size

to.hold.out <- rep(0:1, times = c(n - test.size, test.size)) %>% 
  sample()

Test <- filter(Leukemia, to.hold.out == 1)
Train <- filter(Leukemia, to.hold.out == 0)

head(Test)

## We'll restrict models to the following predictors:
## * Age (age in years)
## * Temp (highest temperature in 10 * degrees F)
## * Smear (differential percentage of "blasts")
## * Infil (percent of marrow leukemia infiltrate)
## * Index (percentage labeling index of bone marrow leukemia cells)
## * Blasts (absolute number of blasts in thousands)

## NB: it's not a good idea to jump into modeling in a domain
## that you don't have any background in, especially when some of 
## the variables are quite technical in nature.  I chose this example 
## because it has a little bit of "messiness", but not too much.
## But you wouldn't want to do this for a project unless you know
## about Leukemia, or at least about these metrics.

##### Checking for transformations

## We'll check individual (quantitative) 
## variables' distributions first to see whether
## any obviously call for a transformation

bwplot(~Age, data = Train) # reasonably symmetric
histogram(~Age, data = Train)

bwplot(~Temp, data = Train)                # kind of right skewed
bwplot(~sqrt(Temp), data = Train)          # didn't really help
bwplot(~log(Temp), data = Train)           # also didn't help
bwplot(~sqrt(Temp - 980), data = Train)    # transformation has more
histogram(~sqrt(Temp - 980), data = Train) #   impact near 0

bwplot(~Smear, data = Train)               # not bad
histogram(~Smear, data = Train)

bwplot(~Infil, data = Train)               # bit left skewed, but not extreme 
histogram(~Infil, data = Train)

bwplot(~Index, data = Train)               # pretty reasonable
histogram(~Index, data = Train)

bwplot(~Blasts, data = Train)              # that's a lot of skew
histogram(~Blasts, data = Train)           # looks structural, not just outliers

with(Train, plot(Blasts))
with(Train, identify(Blasts))

## Would like a log transform; are there zeroes?
min(~Blasts, data = Train)                 # yup

bwplot(~log(1 + Blasts), data = Train)     # better, but still skewed
bwplot(~log(0.5 + Blasts), data = Train)   # better still
bwplot(~log(0.1 + Blasts), data = Train)   # still better
histogram(~log(0.1 + Blasts), data = Train)

## So we have chosen the following transformations:
## * Temp --> sqrt(Temp - 980) 
## * Blasts --> log(0.1 + Blasts)

##### Checking for multicollinearity

## Are any of these variables redundant with the others?
library(car)
## Since vif() expects a model, we'll fit a model with all predictors,
## only for the purposes of checking multicollinearity
full.model <- 
  glm(Resp ~ Age + sqrt(Temp - 980) + Smear + Infil + Index + log(0.1 + Blasts),
     family = "binomial", data = Train)
vif(full.model)

## Smear and Infil are fairly predictable from the others, 
## but not up to VIF > 5

##### Looking at single predictor models

## The main purpose of this step is to assess whether we might want
## any polynomial terms
library(arm)
age.model <- glm(Resp ~ Age, family = "binomial", data = Train)
binnedplot(fitted(age.model), 
           residuals(age.model, type = "pearson"), nclass = 10)
## Might be a bit of nonlinearity, though it's a small dataset

temp.model <- glm(Resp ~ sqrt(Temp - 980), family = "binomial", data = Train)
binnedplot(fitted(temp.model), 
           residuals(temp.model, type = "pearson"), nclass = 10)
## Looks OK

smear.model <- glm(Resp ~ Smear, family = "binomial", data = Train)
binnedplot(fitted(smear.model), 
           residuals(smear.model, type = "pearson"), nclass = 10)
## Tough call

index.model <- glm(Resp ~ Index, family = "binomial", data = Train)
binnedplot(fitted(index.model), 
           residuals(index.model, type = "pearson"), nclass = 10)
## OK

infil.model <- glm(Resp ~ Infil, family = "binomial", data = Train)
binnedplot(fitted(infil.model), 
           residuals(infil.model, type = "pearson"), nclass = 10)
## Probably OK

blasts.model <- glm(Resp ~ log(0.1 + Blasts), family = "binomial", data = Train)
binnedplot(fitted(blasts.model), 
           residuals(blasts.model, type = "pearson"), nclass = 10)
## Could be some curvature, but probably OK

## So, we may want to consider adding Age^2 and Smear^2

## Note: Would be better to choose models using domain knowledge,
## but we'll use a "quick and dirty" method here for expediency
## (Best subsets not available in leaps package for logistic regression, 
## so we'll use stepwise)

## Using poly() here forces the quadratic and linear terms to be considered
## together.  We can consider removing quadratics later.
full.model <- 
  glm(Resp ~ poly(Age,2) + sqrt(Temp - 980) + poly(Smear,2) + Infil + 
             Index + log(0.1 + Blasts), family = "binomial", data = Train)

null.model <- glm(Resp ~ 1, family = "binomial", data = Train)

step(null.model, scope = list(upper = full.model))

###### No clear "cliff", so we'll consider top three models visited
formula1 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2) + Infil
formula2 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2)
formula3 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2) + poly(Smear,2)

##### Check polynomial terms for possible removal
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
model1 <- glm(formula1, family = "binomial", data = Train)
model1b <- glm(formula1b, family = "binomial", data = Train)

anova(model1b, model1, test = "LRT")
AIC(model1)
AIC(model1b)
## Ambiguous: AIC says "keep it"; but nested test yields P > 0.1

## How do the residuals look without the quadratic?
binnedplot(fitted(model1b), residuals(model1b, type = "pearson"))
## They look OK

## What about in the context of model 2?
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
model2 <- glm(formula2, family = "binomial", data = Train)
model2b <- glm(formula2b, family = "binomial", data = Train)

anova(model2b, model2, test = "LRT")
AIC(model2)
AIC(model2b)
## Again ambiguous, though the AIC edge for the more complex model is tiny

## Check the residuals without Age^2...
binnedplot(fitted(model2b), residuals(model2b, type = "pearson"))
## Looks pretty OK

## Tie-breaker goes to parsimony.  Let's get rid of Age^2.

## What about the quadratics in model3?
formula3b <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2) + Smear
formula3c <- Resp ~ Index + sqrt(Temp - 980) + Age + poly(Smear, 2)
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear

model3 <- glm(formula3, family = "binomial", data = Train)
model3b <- glm(formula3b, family = "binomial", data = Train)
model3c <- glm(formula3c, family = "binomial", data = Train)
model3d <- glm(formula3d, family = "binomial", data = Train)

anova(model3b, model3, test = "LRT")  # Checking Smear^2 if Age^2 is there
anova(model3d, model3c, test = "LRT") # Checking Smear^2 if Age^2 isn't there
anova(model3c, model3, test = "LRT")  # Checking Age^2 if Smear^2 is there
anova(model3d, model3b, test = "LRT") # Checking Age^2 if Smear^2 isn't there

AIC(model3)
AIC(model3b)
AIC(model3c)
AIC(model3d)

## Seems clear we can safely discard Smear^2.  Age^2 is less clear, since
## the lowest AIC model has it present, but none of the hypothesis tests
## suggest there is strong evidence for it

## Let's check binned residuals for model3d
binnedplot(fitted(model3d), residuals(model3d, type = "pearson"))
## They look OK, so we're again not worried about lack of linearity without Age^2

## So, we have revised our pool to
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear

##### Checking for possible interactions

## One question we might have is whether the relationship between
## any of the biological markers and the response to treatment is
## modulated by Age.  That is, is there an interaction between
## Age and any of the other variables?

formula4 <- Resp ~ (Index + sqrt(Temp - 980) + Infil + Smear) * Age
model4 <- glm(formula4, family = "binomial", data = Train)

## We can check combinations of interactions using backward elimination
## starting from model4

step(model4, direction = "backward")

## Looks like we can remove all of them.  The best fitting models wind up
## being model 1b and model 3d, which we already have in our pool

## So our list at this point is
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear

finalist.formulas <- c(formula1b, formula2b, formula3d)
models <- lapply(finalist.formulas, glm, family = "binomial", data = Train)
  
## Quick check the AICs
lapply(models, AIC)

## The narrow leader is model2b, which is also the simplest

###### Cross-Validation
source("http://colindawson.net/stat213/code/helper_functions.R")
source("~/213site/code/helper_functions.R")

#### Some error metrics that make sense for logistic regression:

### Normalized deviance: Residual deviance (on validation set) / sample size
### Can use normalized.deviance() from helper_functions.R

### "Hard" classification rate:
###    Count prediction as an "error" if fitted probability 
###    is on the wrong side of 0.5.
###    Compute proportion of validation cases that are errors
###    Can use hard.classification.error() from helper_functions.R

### False positive rate: What proportion of real 0s are misclassified 
### as 1s? (Ignores true 1s)
### helper_functions.R: false.positive.rate

### False negative rate: What proportion of real 1s are misclassified
### as 0s? (Ignores true 0s)
### helper_functions.R: false.negative.rate

### "Soft" classification error = absolute residual: |y - pi|
###     Rewards confident correct predictions
###     Penalizes confident wrong predictions
###     Compute average error per case in validation set
### Can use soft.classification.error() from helper_functions.R

### Concordance, discordance, and Somers' D
###    Consider all pairs of cases with different response values
##       (that is, one 0 and one 1)
##     Count what proportion of the time the 1 is assigned a higher 
##       predicted prob. than the zero (good).  This is the "concordance"
##     Count what proportion of the time the opposite occurs.
##       This is the "discordance" (bad)
##     Note that these may not add to 1, due to possible tied predicted probs.
##     Somers' D := Concordance - Discordance

finalists <- lapply(finalist.formulas, glm, family = "binomial", data = Train)

set.seed(8675309)
n <- nrow(Train)
## Normalized deviances first
nds <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
              error.metric = normalized.deviance, iterations = 10)
nds
## All three are really close

hces <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = hard.classification.error, iterations = 10)
hces
## The leader is model 5 with 41.4% misclassification
## (which isn't that great)

fprs <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = false.positive.rate, iterations = 10)
fprs

## Lots of false positives for all three, but fewest for model2b

fnrs <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = false.negative.rate, iterations = 10)
fnrs

## No false negatives.  Seems like the models tend to be "optimistic"
## about responding to treatment overall.

sces <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
              error.metric = soft.classification.error, iterations = 10)
sces

## No difference

concordances <- 
  lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
         error.metric = concordance, iterations = 10)
concordances

discordances <- 
  lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
         error.metric = discordance, iterations = 10)
discordances

somers.ds <-
  lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
         error.metric = somers.d, iterations = 10)
somers.ds  

## Model 2b looks best

##### Confirm on the test set
test.hces <- 
  lapply(finalists, get.held.out.error, test.set = Test, 
         response.name = "Resp", error.metric = hard.classification.error)

test.hces

test.fprs <- 
  lapply(finalists, get.held.out.error, test.set = Test, 
         response.name = "Resp", error.metric = false.positive.rate)
test.fprs

test.fnrs <- 
  lapply(finalists, get.held.out.error, test.set = Test, 
         response.name = "Resp", error.metric = false.negative.rate)
test.fnrs

test.sces <- 
  lapply(finalists, get.held.out.error, test.set = Test, 
         response.name = "Resp", error.metric = soft.classification.error)

test.sces

test.nds <- 
  lapply(finalists, get.held.out.error, test.set = Test, 
         response.name = "Resp", error.metric = normalized.deviance)
test.nds

test.somers.ds <-
  lapply(finalists, get.held.out.error, test.set = Test, 
         response.name = "Resp", error.metric = somers.d)
test.somers.ds

## No difference

## There is very little basis to choose any one model over the others.
## Model 1b has the lowest test set deviance, while model 2b has the lowest
## cross-validation deviance; none of the models seem to do better than
## any other on the other measures.  So, having no clear reason to include
## the extra predictor in model 1b, we'll go with model 2b as our "final model"
## (Note also that measuring Infil presumably costs money, and to use model 2b
##  we don't need that information)

formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age

##### Fit on the full dataset and interpret
model2b.full <- 
  glm(Resp ~ Index + sqrt(Temp - 980) + Age, family = "binomial",
      data = Leukemia)

model2b.full %>% coef() %>% round(2)

## Prediction equation:
## log odds(responds to treatment) = 0.81 + 0.35 Index 
##                                   - 0.60 sqrt(Temp - 980) - 0.05 Age

with(Leukemia, concordance(Resp, fitted(model2b.full, type = "response")))

## Odds and odds ratios:
model2b.full %>% coef() %>% exp() %>% round(2)

## It seems that higher Index scores are associated with a higher probability
## of responding to treatment, after controlling for age and body temperature, 
## with the odds of response increasing by about 42% with each 1 point 
## increase in Index relative to what is expected based on age and temperature

## It seems that higher body temperatures are associated with a lower
## chance of recovery (controlling for age and Index), but in the model 
## the relationship flattens out some with higher temperatures.  
## For each one unit increase in the square root of temperature 
## in degrees above 98F relative to what is expected given age and index, 
## the odds of responding to treatment go down 45%

## Older patients have worse odds of responding to treatment, controlling for
## body temperature and Index, with each additional
## year of age, relative to the average for body temperature and index,
## associated with a 5% downward adjustment in the odds of recovery.
## Each additional 10 years of age is associated with odds that are
odds.factor <- 0.95^10
odds.factor
## about `r round(100 * 1 - odds.factor)`% [40%] lower

##### Confidence intervals

## Let's look at the predicted recovery probabilities
## for a few example patients.

f.hat <- makeFun.logistic(model2b)

### Patient 1: 20 years old with a high index score and a normal body
### temperature (favorable scenario)

f.hat(Age = 20, Temp = 986, Index = 20, interval = "confidence")

## We estimate that a young patient with a high index score and a normal body temp
## has at least an 88% chance of responding to treatment.

### Patient 1b: same but with a fever of 101F

f.hat(Age = 20, Temp = 1010, Index = 20, interval = "confidence")

## If a patient like this has a fever, the probability of recovery might
## well still be near 100%, but it might be down to something like 73%

### Patient 2: 50 years old, moderate index score, normal body temp
f.hat(Age = 50, Temp = 986, Index = 10, interval = "confidence")

## Patients like this might have anywhere between a 36% and 79%
## chance of responding

### Patient 2b: fever of 101F
f.hat(Age = 50, Temp = 1010, Index = 10, interval = "confidence")

## The presence of a fever of 101F worsens the outlook considerably.
## The probability of responding goes down to somewhere between 4 and 50%

### Patient 3: 75 years old, fever of 101F, low index (unfavorable scenario)
f.hat(Age = 75, Temp = 1010, Index = 5, interval = "confidence")

## Even if our model is pessimistic, we doubt that the probability
## of responding to treatment is higher than 19% or so for a patient
## like this, and more likely it's less than 1%.