library("Stat2Data")
library("mosaic")
data("Leukemia")
###### We are trying to select a model to predict
###### Resp: will a Leukemia patient respond to treatment? (1 = yes, 0 = no)
###### from a set of diagnostic and vitals results
##### Setting aside a "test set"
set.seed(29747) # Don't forget to do this!
n <- nrow(Leukemia)
test.size <- round(0.20 * n)
test.size
to.hold.out <- rep(0:1, times = c(n - test.size, test.size)) %>%
sample()
Test <- filter(Leukemia, to.hold.out == 1)
Train <- filter(Leukemia, to.hold.out == 0)
head(Test)
## We'll restrict models to the following predictors:
## * Age (age in years)
## * Temp (highest temperature in 10 * degrees F)
## * Smear (differential percentage of "blasts")
## * Infil (percent of marrow leukemia infiltrate)
## * Index (percentage labeling index of bone marrow leukemia cells)
## * Blasts (absolute number of blasts in thousands)
## NB: it's not a good idea to jump into modeling in a domain
## that you don't have any background in, especially when some of
## the variables are quite technical in nature. I chose this example
## because it has a little bit of "messiness", but not too much.
## But you wouldn't want to do this for a project unless you know
## about Leukemia, or at least about these metrics.
##### Checking for transformations
## We'll check individual (quantitative)
## variables' distributions first to see whether
## any obviously call for a transformation
bwplot(~Age, data = Train) # reasonably symmetric
histogram(~Age, data = Train)
bwplot(~Temp, data = Train) # kind of right skewed
bwplot(~sqrt(Temp), data = Train) # didn't really help
bwplot(~log(Temp), data = Train) # also didn't help
bwplot(~sqrt(Temp - 980), data = Train) # transformation has more
histogram(~sqrt(Temp - 980), data = Train) # impact near 0
bwplot(~Smear, data = Train) # not bad
histogram(~Smear, data = Train)
bwplot(~Infil, data = Train) # bit left skewed, but not extreme
histogram(~Infil, data = Train)
bwplot(~Index, data = Train) # pretty reasonable
histogram(~Index, data = Train)
bwplot(~Blasts, data = Train) # that's a lot of skew
histogram(~Blasts, data = Train) # looks structural, not just outliers
with(Train, plot(Blasts))
with(Train, identify(Blasts))
## Would like a log transform; are there zeroes?
min(~Blasts, data = Train) # yup
bwplot(~log(1 + Blasts), data = Train) # better, but still skewed
bwplot(~log(0.5 + Blasts), data = Train) # better still
bwplot(~log(0.1 + Blasts), data = Train) # still better
histogram(~log(0.1 + Blasts), data = Train)
## So we have chosen the following transformations:
## * Temp --> sqrt(Temp - 980)
## * Blasts --> log(0.1 + Blasts)
##### Checking for multicollinearity
## Are any of these variables redundant with the others?
library(car)
## Since vif() expects a model, we'll fit a model with all predictors,
## only for the purposes of checking multicollinearity
full.model <-
glm(Resp ~ Age + sqrt(Temp - 980) + Smear + Infil + Index + log(0.1 + Blasts),
family = "binomial", data = Train)
vif(full.model)
## Smear and Infil are fairly predictable from the others,
## but not up to VIF > 5
##### Looking at single predictor models
## The main purpose of this step is to assess whether we might want
## any polynomial terms
library(arm)
age.model <- glm(Resp ~ Age, family = "binomial", data = Train)
binnedplot(fitted(age.model),
residuals(age.model, type = "pearson"), nclass = 10)
## Might be a bit of nonlinearity, though it's a small dataset
temp.model <- glm(Resp ~ sqrt(Temp - 980), family = "binomial", data = Train)
binnedplot(fitted(temp.model),
residuals(temp.model, type = "pearson"), nclass = 10)
## Looks OK
smear.model <- glm(Resp ~ Smear, family = "binomial", data = Train)
binnedplot(fitted(smear.model),
residuals(smear.model, type = "pearson"), nclass = 10)
## Tough call
index.model <- glm(Resp ~ Index, family = "binomial", data = Train)
binnedplot(fitted(index.model),
residuals(index.model, type = "pearson"), nclass = 10)
## OK
infil.model <- glm(Resp ~ Infil, family = "binomial", data = Train)
binnedplot(fitted(infil.model),
residuals(infil.model, type = "pearson"), nclass = 10)
## Probably OK
blasts.model <- glm(Resp ~ log(0.1 + Blasts), family = "binomial", data = Train)
binnedplot(fitted(blasts.model),
residuals(blasts.model, type = "pearson"), nclass = 10)
## Could be some curvature, but probably OK
## So, we may want to consider adding Age^2 and Smear^2
## Note: Would be better to choose models using domain knowledge,
## but we'll use a "quick and dirty" method here for expediency
## (Best subsets not available in leaps package for logistic regression,
## so we'll use stepwise)
## Using poly() here forces the quadratic and linear terms to be considered
## together. We can consider removing quadratics later.
full.model <-
glm(Resp ~ poly(Age,2) + sqrt(Temp - 980) + poly(Smear,2) + Infil +
Index + log(0.1 + Blasts), family = "binomial", data = Train)
null.model <- glm(Resp ~ 1, family = "binomial", data = Train)
step(null.model, scope = list(upper = full.model))
###### No clear "cliff", so we'll consider top three models visited
formula1 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2) + Infil
formula2 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2)
formula3 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2) + poly(Smear,2)
##### Check polynomial terms for possible removal
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
model1 <- glm(formula1, family = "binomial", data = Train)
model1b <- glm(formula1b, family = "binomial", data = Train)
anova(model1b, model1, test = "LRT")
AIC(model1)
AIC(model1b)
## Ambiguous: AIC says "keep it"; but nested test yields P > 0.1
## How do the residuals look without the quadratic?
binnedplot(fitted(model1b), residuals(model1b, type = "pearson"))
## They look OK
## What about in the context of model 2?
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
model2 <- glm(formula2, family = "binomial", data = Train)
model2b <- glm(formula2b, family = "binomial", data = Train)
anova(model2b, model2, test = "LRT")
AIC(model2)
AIC(model2b)
## Again ambiguous, though the AIC edge for the more complex model is tiny
## Check the residuals without Age^2...
binnedplot(fitted(model2b), residuals(model2b, type = "pearson"))
## Looks pretty OK
## Tie-breaker goes to parsimony. Let's get rid of Age^2.
## What about the quadratics in model3?
formula3b <- Resp ~ Index + sqrt(Temp - 980) + poly(Age,2) + Smear
formula3c <- Resp ~ Index + sqrt(Temp - 980) + Age + poly(Smear, 2)
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear
model3 <- glm(formula3, family = "binomial", data = Train)
model3b <- glm(formula3b, family = "binomial", data = Train)
model3c <- glm(formula3c, family = "binomial", data = Train)
model3d <- glm(formula3d, family = "binomial", data = Train)
anova(model3b, model3, test = "LRT") # Checking Smear^2 if Age^2 is there
anova(model3d, model3c, test = "LRT") # Checking Smear^2 if Age^2 isn't there
anova(model3c, model3, test = "LRT") # Checking Age^2 if Smear^2 is there
anova(model3d, model3b, test = "LRT") # Checking Age^2 if Smear^2 isn't there
AIC(model3)
AIC(model3b)
AIC(model3c)
AIC(model3d)
## Seems clear we can safely discard Smear^2. Age^2 is less clear, since
## the lowest AIC model has it present, but none of the hypothesis tests
## suggest there is strong evidence for it
## Let's check binned residuals for model3d
binnedplot(fitted(model3d), residuals(model3d, type = "pearson"))
## They look OK, so we're again not worried about lack of linearity without Age^2
## So, we have revised our pool to
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear
##### Checking for possible interactions
## One question we might have is whether the relationship between
## any of the biological markers and the response to treatment is
## modulated by Age. That is, is there an interaction between
## Age and any of the other variables?
formula4 <- Resp ~ (Index + sqrt(Temp - 980) + Infil + Smear) * Age
model4 <- glm(formula4, family = "binomial", data = Train)
## We can check combinations of interactions using backward elimination
## starting from model4
step(model4, direction = "backward")
## Looks like we can remove all of them. The best fitting models wind up
## being model 1b and model 3d, which we already have in our pool
## So our list at this point is
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear
finalist.formulas <- c(formula1b, formula2b, formula3d)
models <- lapply(finalist.formulas, glm, family = "binomial", data = Train)
## Quick check the AICs
lapply(models, AIC)
## The narrow leader is model2b, which is also the simplest
###### Cross-Validation
source("http://colindawson.net/stat213/code/helper_functions.R")
source("~/213site/code/helper_functions.R")
#### Some error metrics that make sense for logistic regression:
### Normalized deviance: Residual deviance (on validation set) / sample size
### Can use normalized.deviance() from helper_functions.R
### "Hard" classification rate:
### Count prediction as an "error" if fitted probability
### is on the wrong side of 0.5.
### Compute proportion of validation cases that are errors
### Can use hard.classification.error() from helper_functions.R
### False positive rate: What proportion of real 0s are misclassified
### as 1s? (Ignores true 1s)
### helper_functions.R: false.positive.rate
### False negative rate: What proportion of real 1s are misclassified
### as 0s? (Ignores true 0s)
### helper_functions.R: false.negative.rate
### "Soft" classification error = absolute residual: |y - pi|
### Rewards confident correct predictions
### Penalizes confident wrong predictions
### Compute average error per case in validation set
### Can use soft.classification.error() from helper_functions.R
### Concordance, discordance, and Somers' D
### Consider all pairs of cases with different response values
## (that is, one 0 and one 1)
## Count what proportion of the time the 1 is assigned a higher
## predicted prob. than the zero (good). This is the "concordance"
## Count what proportion of the time the opposite occurs.
## This is the "discordance" (bad)
## Note that these may not add to 1, due to possible tied predicted probs.
## Somers' D := Concordance - Discordance
finalists <- lapply(finalist.formulas, glm, family = "binomial", data = Train)
set.seed(8675309)
n <- nrow(Train)
## Normalized deviances first
nds <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = normalized.deviance, iterations = 10)
nds
## All three are really close
hces <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = hard.classification.error, iterations = 10)
hces
## The leader is model 5 with 41.4% misclassification
## (which isn't that great)
fprs <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = false.positive.rate, iterations = 10)
fprs
## Lots of false positives for all three, but fewest for model2b
fnrs <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = false.negative.rate, iterations = 10)
fnrs
## No false negatives. Seems like the models tend to be "optimistic"
## about responding to treatment overall.
sces <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = soft.classification.error, iterations = 10)
sces
## No difference
concordances <-
lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = concordance, iterations = 10)
concordances
discordances <-
lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = discordance, iterations = 10)
discordances
somers.ds <-
lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
error.metric = somers.d, iterations = 10)
somers.ds
## Model 2b looks best
##### Confirm on the test set
test.hces <-
lapply(finalists, get.held.out.error, test.set = Test,
response.name = "Resp", error.metric = hard.classification.error)
test.hces
test.fprs <-
lapply(finalists, get.held.out.error, test.set = Test,
response.name = "Resp", error.metric = false.positive.rate)
test.fprs
test.fnrs <-
lapply(finalists, get.held.out.error, test.set = Test,
response.name = "Resp", error.metric = false.negative.rate)
test.fnrs
test.sces <-
lapply(finalists, get.held.out.error, test.set = Test,
response.name = "Resp", error.metric = soft.classification.error)
test.sces
test.nds <-
lapply(finalists, get.held.out.error, test.set = Test,
response.name = "Resp", error.metric = normalized.deviance)
test.nds
test.somers.ds <-
lapply(finalists, get.held.out.error, test.set = Test,
response.name = "Resp", error.metric = somers.d)
test.somers.ds
## No difference
## There is very little basis to choose any one model over the others.
## Model 1b has the lowest test set deviance, while model 2b has the lowest
## cross-validation deviance; none of the models seem to do better than
## any other on the other measures. So, having no clear reason to include
## the extra predictor in model 1b, we'll go with model 2b as our "final model"
## (Note also that measuring Infil presumably costs money, and to use model 2b
## we don't need that information)
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
##### Fit on the full dataset and interpret
model2b.full <-
glm(Resp ~ Index + sqrt(Temp - 980) + Age, family = "binomial",
data = Leukemia)
model2b.full %>% coef() %>% round(2)
## Prediction equation:
## log odds(responds to treatment) = 0.81 + 0.35 Index
## - 0.60 sqrt(Temp - 980) - 0.05 Age
with(Leukemia, concordance(Resp, fitted(model2b.full, type = "response")))
## Odds and odds ratios:
model2b.full %>% coef() %>% exp() %>% round(2)
## It seems that higher Index scores are associated with a higher probability
## of responding to treatment, after controlling for age and body temperature,
## with the odds of response increasing by about 42% with each 1 point
## increase in Index relative to what is expected based on age and temperature
## It seems that higher body temperatures are associated with a lower
## chance of recovery (controlling for age and Index), but in the model
## the relationship flattens out some with higher temperatures.
## For each one unit increase in the square root of temperature
## in degrees above 98F relative to what is expected given age and index,
## the odds of responding to treatment go down 45%
## Older patients have worse odds of responding to treatment, controlling for
## body temperature and Index, with each additional
## year of age, relative to the average for body temperature and index,
## associated with a 5% downward adjustment in the odds of recovery.
## Each additional 10 years of age is associated with odds that are
odds.factor <- 0.95^10
odds.factor
## about `r round(100 * 1 - odds.factor)`% [40%] lower
##### Confidence intervals
## Let's look at the predicted recovery probabilities
## for a few example patients.
f.hat <- makeFun.logistic(model2b)
### Patient 1: 20 years old with a high index score and a normal body
### temperature (favorable scenario)
f.hat(Age = 20, Temp = 986, Index = 20, interval = "confidence")
## We estimate that a young patient with a high index score and a normal body temp
## has at least an 88% chance of responding to treatment.
### Patient 1b: same but with a fever of 101F
f.hat(Age = 20, Temp = 1010, Index = 20, interval = "confidence")
## If a patient like this has a fever, the probability of recovery might
## well still be near 100%, but it might be down to something like 73%
### Patient 2: 50 years old, moderate index score, normal body temp
f.hat(Age = 50, Temp = 986, Index = 10, interval = "confidence")
## Patients like this might have anywhere between a 36% and 79%
## chance of responding
### Patient 2b: fever of 101F
f.hat(Age = 50, Temp = 1010, Index = 10, interval = "confidence")
## The presence of a fever of 101F worsens the outlook considerably.
## The probability of responding goes down to somewhere between 4 and 50%
### Patient 3: 75 years old, fever of 101F, low index (unfavorable scenario)
f.hat(Age = 75, Temp = 1010, Index = 5, interval = "confidence")
## Even if our model is pessimistic, we doubt that the probability
## of responding to treatment is higher than 19% or so for a patient
## like this, and more likely it's less than 1%.