library("mosaic")
library("Stat2Data")
library("arm") # for binnedplot()
data("ICU")
?ICU
### Want to estimate patient survival probability
## Using Systolic BP
lin.BP.model <- glm(Survive ~ SysBP, data = ICU, family = "binomial")
plotModel(lin.BP.model)
binnedplot(fitted(lin.BP.model), residuals(lin.BP.model, type = "pearson"),
nclass = 25)
## Looks like might be some nonlinearity
## Maybe a quadratic?
quad.BP.model <-
glm(Survive ~ SysBP + I(SysBP^2), family = "binomial", data = ICU)
plotModel(quad.BP.model)
binnedplot(fitted(quad.BP.model), residuals(quad.BP.model, type = "pearson"), nclass = 25)
## Not a whole lot better, tbh
summary(quad.BP.model)
## Based on z test, quadratic is helpful, though
## Look at the residuals for the first order model
dresids <- residuals(lin.BP.model, type = "deviance")
histogram(dresids, breaks = 30)
## They look OK, except for one
presids <- residuals(lin.BP.model, type = "pearson")
histogram(presids, breaks = 30)
## Oof
plot(presids)
## oof
## Lets us click on a point and identify it (don't run this in Markdown)
identify(presids)
## Let's see what it is
ICU[196,]
## SysBP is pretty high... Where is it in the distribution?
favstats(~SysBP, data = ICU)
bwplot(~SysBP, data = ICU)
## Looks like it's the largest predictor value, on top of
## having the largest residual. That's a lot of influence!
## Cook's distance (note: doesn't measure influence exactly right
## in logistic regression)
plot(lin.BP.model, which = 4)
## Try fitting the models without it
lin.BP.model2 <-
glm(Survive ~ SysBP, data = filter(ICU, ID != 921), family = "binomial")
plotModel(lin.BP.model)
quad.BP.model2 <-
glm(Survive ~ SysBP + I(SysBP^2), family = "binomial", data = filter(ICU, ID != 921))
plotModel(quad.BP.model2)
## Removing the outlier makes a huge difference to the RHS of the curve
summary(quad.BP.model2)
## Quadratic no longer significant