library("Stat2Data") library("mosaic") data("Leukemia") ###### We are trying to select a model to predict ###### Resp: will a Leukemia patient respond to treatment? (1 = yes, 0 = no) ###### from a set of diagnostic and vitals results ##### Setting aside a "test set" set.seed(29747) # Don't forget to do this! n <- nrow(Leukemia) test.size <- round(0.20 * n) test.size to.hold.out <- rep(0:1, times = c(n - test.size, test.size)) %>% sample() Test <- filter(Leukemia, to.hold.out == 1) Train <- filter(Leukemia, to.hold.out == 0) head(Test) ## We'll restrict models to the following predictors: ## * Age (age in years) ## * Temp (highest temperature in 10 * degrees F) ## * Smear (differential percentage of "blasts") ## * Infil (percent of marrow leukemia infiltrate) ## * Index (percentage labeling index of bone marrow leukemia cells) ## * Blasts (absolute number of blasts in thousands) ## NB: it's not a good idea to jump into modeling in a domain ## that you don't have any background in, especially when some of ## the variables are quite technical in nature. I chose this example ## because it has a little bit of "messiness", but not too much. ## But you wouldn't want to do this for a project unless you know ## about Leukemia, or at least about these metrics. ##### Checking for transformations ## We'll check individual (quantitative) ## variables' distributions first to see whether ## any obviously call for a transformation bwplot(~Age, data = Train) # reasonably symmetric histogram(~Age, data = Train) bwplot(~Temp, data = Train) # kind of right skewed bwplot(~sqrt(Temp), data = Train) # didn't really help bwplot(~log(Temp), data = Train) # also didn't help bwplot(~sqrt(Temp - 980), data = Train) # transformation has more histogram(~sqrt(Temp - 980), data = Train) # impact near 0 bwplot(~Smear, data = Train) # not bad histogram(~Smear, data = Train) bwplot(~Infil, data = Train) # bit left skewed, but not extreme histogram(~Infil, data = Train) bwplot(~Index, data = Train) # pretty reasonable histogram(~Index, data = Train) bwplot(~Blasts, data = Train) # that's a lot of skew histogram(~Blasts, data = Train) # looks structural, not just outliers with(Train, plot(Blasts)) with(Train, identify(Blasts)) ## Would like a log transform; are there zeroes? min(~Blasts, data = Train) # yup bwplot(~log(1 + Blasts), data = Train) # better, but still skewed bwplot(~log(0.5 + Blasts), data = Train) # better still bwplot(~log(0.1 + Blasts), data = Train) # still better histogram(~log(0.1 + Blasts), data = Train) ## So we have chosen the following transformations: ## * Temp --> sqrt(Temp - 980) ## * Blasts --> log(0.1 + Blasts) ##### Checking for multicollinearity ## Are any of these variables redundant with the others? 
##### Checking for multicollinearity

## Are any of these variables redundant with the others?
library(car)
## Since vif() expects a model, we'll fit a model with all the predictors,
## purely for the purpose of checking multicollinearity
full.model <- glm(Resp ~ Age + sqrt(Temp - 980) + Smear + Infil +
                    Index + log(0.1 + Blasts),
                  family = "binomial", data = Train)
vif(full.model)
## Smear and Infil are fairly predictable from the others,
## but neither reaches VIF > 5

##### Looking at single-predictor models

## The main purpose of this step is to assess whether we might want
## any polynomial terms
library(arm)
age.model <- glm(Resp ~ Age, family = "binomial", data = Train)
binnedplot(fitted(age.model),
           residuals(age.model, type = "pearson"), nclass = 10)
## Might be a bit of nonlinearity, though it's a small dataset

temp.model <- glm(Resp ~ sqrt(Temp - 980), family = "binomial", data = Train)
binnedplot(fitted(temp.model),
           residuals(temp.model, type = "pearson"), nclass = 10)
## Looks OK

smear.model <- glm(Resp ~ Smear, family = "binomial", data = Train)
binnedplot(fitted(smear.model),
           residuals(smear.model, type = "pearson"), nclass = 10)
## Tough call

index.model <- glm(Resp ~ Index, family = "binomial", data = Train)
binnedplot(fitted(index.model),
           residuals(index.model, type = "pearson"), nclass = 10)
## OK

infil.model <- glm(Resp ~ Infil, family = "binomial", data = Train)
binnedplot(fitted(infil.model),
           residuals(infil.model, type = "pearson"), nclass = 10)
## Probably OK

blasts.model <- glm(Resp ~ log(0.1 + Blasts), family = "binomial", data = Train)
binnedplot(fitted(blasts.model),
           residuals(blasts.model, type = "pearson"), nclass = 10)
## Could be some curvature, but probably OK

## So, we may want to consider adding Age^2 and Smear^2

## Note: it would be better to choose models using domain knowledge,
## but we'll use a "quick and dirty" method here for expediency.
## (Best subsets isn't available in the leaps package for logistic
## regression, so we'll use stepwise selection.)

## Using poly() here forces the quadratic and linear terms to be considered
## together. We can consider removing the quadratics later.
full.model <- glm(Resp ~ poly(Age, 2) + sqrt(Temp - 980) + poly(Smear, 2) +
                    Infil + Index + log(0.1 + Blasts),
                  family = "binomial", data = Train)
null.model <- glm(Resp ~ 1, family = "binomial", data = Train)
step(null.model, scope = list(upper = full.model))

###### No clear "cliff" in AIC, so we'll consider the top three models visited
formula1 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age, 2) + Infil
formula2 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age, 2)
formula3 <- Resp ~ Index + sqrt(Temp - 980) + poly(Age, 2) + poly(Smear, 2)

##### Check polynomial terms for possible removal
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
model1  <- glm(formula1,  family = "binomial", data = Train)
model1b <- glm(formula1b, family = "binomial", data = Train)
anova(model1b, model1, test = "LRT")
AIC(model1)
AIC(model1b)
## Ambiguous: AIC says "keep it", but the nested test yields P > 0.1
## How do the residuals look without the quadratic?
binnedplot(fitted(model1b), residuals(model1b, type = "pearson"))
## They look OK

## What about in the context of model 2?
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
model2  <- glm(formula2,  family = "binomial", data = Train)
model2b <- glm(formula2b, family = "binomial", data = Train)
anova(model2b, model2, test = "LRT")
AIC(model2)
AIC(model2b)
## Again ambiguous, though the AIC edge for the more complex model is tiny
## Check the residuals without Age^2...
binnedplot(fitted(model2b), residuals(model2b, type = "pearson"))
## Looks pretty OK
## The tie-breaker goes to parsimony: let's get rid of Age^2.
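## For reference, the LRT that anova() reports above can be computed by
## hand: the drop in deviance between the nested models is compared to a
## chi-squared distribution with df equal to the difference in parameter
## counts. (A sketch; it should agree with the anova() output for
## model2 vs. model2b.)
lrt.stat <- deviance(model2b) - deviance(model2)
df.diff  <- df.residual(model2b) - df.residual(model2)
pchisq(lrt.stat, df = df.diff, lower.tail = FALSE)  # the LRT P-value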
## What about the quadratics in model 3?
formula3b <- Resp ~ Index + sqrt(Temp - 980) + poly(Age, 2) + Smear
formula3c <- Resp ~ Index + sqrt(Temp - 980) + Age + poly(Smear, 2)
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear
model3  <- glm(formula3,  family = "binomial", data = Train)
model3b <- glm(formula3b, family = "binomial", data = Train)
model3c <- glm(formula3c, family = "binomial", data = Train)
model3d <- glm(formula3d, family = "binomial", data = Train)
anova(model3b, model3,  test = "LRT")  # Checking Smear^2 if Age^2 is there
anova(model3d, model3c, test = "LRT")  # Checking Smear^2 if Age^2 isn't there
anova(model3c, model3,  test = "LRT")  # Checking Age^2 if Smear^2 is there
anova(model3d, model3b, test = "LRT")  # Checking Age^2 if Smear^2 isn't there
AIC(model3)
AIC(model3b)
AIC(model3c)
AIC(model3d)
## It seems clear we can safely discard Smear^2. Age^2 is less clear, since
## the lowest-AIC model includes it, but none of the hypothesis tests
## provides strong evidence for it.
## Let's check the binned residuals for model3d
binnedplot(fitted(model3d), residuals(model3d, type = "pearson"))
## They look OK, so again we're not worried about nonlinearity without Age^2

## So, we have revised our pool to
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear

##### Checking for possible interactions

## One question we might have is whether the relationship between
## any of the biological markers and the response to treatment is
## modulated by Age. That is, is there an interaction between
## Age and any of the other variables?
formula4 <- Resp ~ (Index + sqrt(Temp - 980) + Infil + Smear) * Age
model4 <- glm(formula4, family = "binomial", data = Train)
## We can check combinations of interactions using backward elimination,
## starting from model4
step(model4, direction = "backward")
## Looks like we can remove all of them. The best-fitting models wind up
## being models 1b and 3d, which we already have in our pool.

## So our list at this point is
formula1b <- Resp ~ Index + sqrt(Temp - 980) + Age + Infil
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
formula3d <- Resp ~ Index + sqrt(Temp - 980) + Age + Smear
finalist.formulas <- c(formula1b, formula2b, formula3d)
models <- lapply(finalist.formulas, glm, family = "binomial", data = Train)
## A quick check of the AICs
lapply(models, AIC)
## The narrow leader is model 2b, which is also the simplest

###### Cross-Validation
source("http://colindawson.net/stat213/code/helper_functions.R")

#### Some error metrics that make sense for logistic regression:

### Normalized deviance: residual deviance (on the validation set) / sample size
###   Can use normalized.deviance() from helper_functions.R

### "Hard" classification rate: count a prediction as an "error" if the
###   fitted probability is on the wrong side of 0.5, and compute the
###   proportion of validation cases that are errors.
###   Can use hard.classification.error() from helper_functions.R

### False positive rate: what proportion of real 0s are misclassified
###   as 1s? (Ignores true 1s)
###   Can use false.positive.rate() from helper_functions.R

### False negative rate: what proportion of real 1s are misclassified
###   as 0s? (Ignores true 0s)
###   Can use false.negative.rate() from helper_functions.R

### "Soft" classification error = absolute residual: |y - pi|
###   Rewards confident correct predictions,
###   penalizes confident wrong predictions.
###   Compute the average error per case in the validation set.
###   Can use soft.classification.error() from helper_functions.R

### Concordance, discordance, and Somers' D:
###   Consider all pairs of cases with different response values
###   (that is, one 0 and one 1).
###   The proportion of such pairs in which the 1 is assigned a higher
###   predicted probability than the 0 (good) is the "concordance".
###   The proportion of pairs where the opposite occurs is the
###   "discordance" (bad).
###   Note that these may not add to 1, due to possible tied
###   predicted probabilities.
###   Somers' D := Concordance - Discordance
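## To make a couple of these concrete, here are minimal sketches of the
## hard classification error and Somers' D. These are for illustration
## only; the actual implementations in helper_functions.R may differ in
## details (e.g., how tied predicted probabilities are handled).
hard.error.sketch <- function(model, data) {
  p <- predict(model, newdata = data, type = "response")
  mean((p > 0.5) != data$Resp)  # proportion on the wrong side of 0.5
}
somers.d.sketch <- function(model, data) {
  p  <- predict(model, newdata = data, type = "response")
  p1 <- p[data$Resp == 1]
  p0 <- p[data$Resp == 0]
  ## concordance minus discordance, over all (1, 0) pairs
  mean(outer(p1, p0, ">")) - mean(outer(p1, p0, "<"))
}
hard.error.sketch(models[[2]], Train)  # training (not CV) error, just to demo
somers.d.sketch(models[[2]], Train)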
finalists <- lapply(finalist.formulas, glm, family = "binomial", data = Train)
set.seed(8675309)
n <- nrow(Train)

## Normalized deviances first
nds <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
              error.metric = normalized.deviance, iterations = 10)
nds
## All three are really close

hces <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = hard.classification.error, iterations = 10)
hces
## The leader is model 2b, with 41.4% misclassification
## (which isn't that great)

fprs <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = false.positive.rate, iterations = 10)
fprs
## Lots of false positives for all three, but fewest for model 2b

fnrs <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = false.negative.rate, iterations = 10)
fnrs
## No false negatives. It seems the models tend to be "optimistic"
## about responding to treatment overall.

sces <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
               error.metric = soft.classification.error, iterations = 10)
sces
## No difference

concordances <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
                       error.metric = concordance, iterations = 10)
concordances
discordances <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
                       error.metric = discordance, iterations = 10)
discordances
somers.ds <- lapply(finalists, do.k.fold.cv, data = Train, num.folds = 10,
                    error.metric = somers.d, iterations = 10)
somers.ds
## Model 2b looks best

##### Confirm on the test set
test.hces <- lapply(finalists, get.held.out.error, test.set = Test,
                    response.name = "Resp",
                    error.metric = hard.classification.error)
test.hces
test.fprs <- lapply(finalists, get.held.out.error, test.set = Test,
                    response.name = "Resp",
                    error.metric = false.positive.rate)
test.fprs
test.fnrs <- lapply(finalists, get.held.out.error, test.set = Test,
                    response.name = "Resp",
                    error.metric = false.negative.rate)
test.fnrs
test.sces <- lapply(finalists, get.held.out.error, test.set = Test,
                    response.name = "Resp",
                    error.metric = soft.classification.error)
test.sces
test.nds <- lapply(finalists, get.held.out.error, test.set = Test,
                   response.name = "Resp",
                   error.metric = normalized.deviance)
test.nds
test.somers.ds <- lapply(finalists, get.held.out.error, test.set = Test,
                         response.name = "Resp",
                         error.metric = somers.d)
test.somers.ds
## No difference

## There is very little basis to choose any one model over the others.
## Model 1b has the lowest test-set deviance, while model 2b has the lowest
## cross-validation deviance; none of the models does clearly better than
## the others on the remaining measures. So, having no clear reason to
## include the extra predictor in model 1b, we'll go with model 2b as our
## "final model". (Note also that measuring Infil presumably costs money,
## and model 2b doesn't need that information.)
formula2b <- Resp ~ Index + sqrt(Temp - 980) + Age
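## For reference, a minimal sketch of what get.held.out.error() presumably
## does for one model and the normalized-deviance metric (the helper's
## actual implementation may differ; held.out.nd() is defined here purely
## for illustration):
held.out.nd <- function(model, test.set) {
  p <- predict(model, newdata = test.set, type = "response")
  y <- test.set$Resp
  ## binomial deviance on the held-out cases, divided by the sample size
  -2 * sum(y * log(p) + (1 - y) * log(1 - p)) / nrow(test.set)
}
held.out.nd(finalists[[2]], Test)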
##### Fit on the full dataset and interpret
model2b.full <- glm(Resp ~ Index + sqrt(Temp - 980) + Age,
                    family = "binomial", data = Leukemia)
model2b.full %>% coef() %>% round(2)

## Prediction equation:
## log odds(responds to treatment) =
##   0.81 + 0.35 * Index - 0.60 * sqrt(Temp - 980) - 0.05 * Age
with(Leukemia, concordance(Resp, fitted(model2b.full)))

## Odds and odds ratios:
model2b.full %>% coef() %>% exp() %>% round(2)

## Higher Index scores are associated with a higher probability of
## responding to treatment, after controlling for age and body temperature:
## each 1-point increase in Index is associated with about a 42% increase
## in the odds of response, holding age and temperature fixed.

## Higher body temperatures are associated with a lower chance of
## responding (controlling for age and Index), though under the square-root
## transformation the relationship flattens out somewhat at higher
## temperatures. Each one-unit increase in sqrt(Temp - 980) (recall that
## Temp is recorded in tenths of a degree F, so 980 is 98.0F) is associated
## with about a 45% decrease in the odds of responding, holding age and
## Index fixed.

## Older patients have worse odds of responding to treatment, controlling
## for body temperature and Index: each additional year of age is
## associated with about a 5% decrease in the odds of responding.
## Each additional 10 years of age is associated with odds that are
## about 40% lower:
odds.factor <- 0.95^10
odds.factor
round(100 * (1 - odds.factor))  # percent decrease in the odds

##### Confidence intervals

## Let's look at the predicted recovery probabilities
## for a few example patients.
f.hat <- makeFun.logistic(model2b.full)

### Patient 1: 20 years old, with a high Index score and a normal body
### temperature (favorable scenario)
f.hat(Age = 20, Temp = 986, Index = 20, interval = "confidence")
## We estimate that a young patient with a high Index score and a normal
## body temperature has at least an 88% chance of responding to treatment.

### Patient 1b: same, but with a fever of 101F
f.hat(Age = 20, Temp = 1010, Index = 20, interval = "confidence")
## If a patient like this has a fever, the probability of recovery might
## well still be near 100%, but it might be down to something like 73%

### Patient 2: 50 years old, moderate Index score, normal body temperature
f.hat(Age = 50, Temp = 986, Index = 10, interval = "confidence")
## Patients like this might have anywhere between a 36% and a 79%
## chance of responding

### Patient 2b: same, but with a fever of 101F
f.hat(Age = 50, Temp = 1010, Index = 10, interval = "confidence")
## A fever of 101F worsens the outlook considerably: the probability of
## responding goes down to somewhere between 4% and 50%

### Patient 3: 75 years old, fever of 101F, low Index (unfavorable scenario)
f.hat(Age = 75, Temp = 1010, Index = 5, interval = "confidence")
## Even if our model is pessimistic, we doubt that the probability of
## responding to treatment is higher than 19% or so for a patient like
## this, and more likely it's below 1%.
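## makeFun.logistic() is a course helper (presumably from
## helper_functions.R); for reference, the same kind of interval can be
## sketched with base predict(): compute a normal-theory CI on the
## log-odds scale, then back-transform with plogis(). (A sketch;
## new.patient and the 1.96 multiplier are illustrative, and the result
## may differ slightly from the helper's.)
new.patient <- data.frame(Age = 20, Temp = 986, Index = 20)
link.pred <- predict(model2b.full, newdata = new.patient, se.fit = TRUE)
ci.logit  <- link.pred$fit + c(-1.96, 1.96) * link.pred$se.fit
plogis(ci.logit)  # approximate 95% CI on the probability scale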