Model: Formula
Using the data set fat from the library faraway, we will first fit a simple linear regression of Brozek’s body fat percentage on age.
library(faraway)
with(fat,plot(age, brozek, xlab="Age (years)", ylab = "Percent body fat using Brozek's equation"))
bfat.lm1 <- lm(brozek ~ age, data=fat)
summary(bfat.lm1)
##
## Call:
## lm(formula = brozek ~ age, data = fat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.0697 -5.7025 0.2846 4.8301 25.0739
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.95546 1.73576 6.312 1.25e-09 ***
## age 0.17786 0.03724 4.776 3.04e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.435 on 250 degrees of freedom
## Multiple R-squared: 0.08362, Adjusted R-squared: 0.07996
## F-statistic: 22.81 on 1 and 250 DF, p-value: 3.045e-06
plot(bfat.lm1)
Let’s add some more variables to the regression equation.
bfat.lm2 <- lm(brozek ~ age + weight + height + abdom, data=fat)
with(fat,pairs(~brozek + age + weight + height + abdom))
summary(bfat.lm2)
##
## Call:
## lm(formula = brozek ~ age + weight + height + abdom, data = fat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.5105 -2.9346 0.0087 2.8942 9.4179
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.769636 6.541902 -5.009 1.04e-06 ***
## age -0.007051 0.024342 -0.290 0.772
## weight -0.123722 0.025046 -4.940 1.44e-06 ***
## height -0.116694 0.082727 -1.411 0.160
## abdom 0.889704 0.067267 13.226 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.126 on 247 degrees of freedom
## Multiple R-squared: 0.7211, Adjusted R-squared: 0.7166
## F-statistic: 159.7 on 4 and 247 DF, p-value: < 2.2e-16
plot(bfat.lm2)
head(fitted(bfat.lm2)) #fiited values (y-hat)
## 1 2 3 4 5 6
## 15.88088 11.05474 18.49610 12.62873 24.92136 16.31382
head(residuals(bfat.lm2)) #residuals
## 1 2 3 4 5 6
## -3.280875 -4.154741 6.103901 -1.728729 2.878638 4.286176
head(confint(bfat.lm2)) #confidence intervals fot he betas
## 2.5 % 97.5 %
## (Intercept) -45.65466337 -19.88460834
## age -0.05499491 0.04089239
## weight -0.17305183 -0.07439172
## height -0.27963414 0.04624622
## abdom 0.75721358 1.02219461
head(predict(bfat.lm2)) #fitted values, but can also be used to make new predictions from the model
## 1 2 3 4 5 6
## 15.88088 11.05474 18.49610 12.62873 24.92136 16.31382
predict(bfat.lm2, data.frame(age=23, weight=154.25, height = 67.75, abdom=85.2)) #calculated prediction equation for new data in data frame
## 1
## 15.88088
Let’s come back to the pima indians diabetes study. We will look at the regression of glucose on age between the two groups of those with and without diabetes.
pima.lm <- lm(glucose ~ age*factor(test), data=pima)
summary(pima.lm)
##
## Call:
## lm(formula = glucose ~ age * factor(test), data = pima)
##
## Residuals:
## Min 1Q Median 3Q Max
## -142.39 -17.69 -1.85 17.24 87.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.0460 3.5592 26.424 < 2e-16 ***
## age 0.5109 0.1069 4.779 2.11e-06 ***
## factor(test)1 36.5724 6.9832 5.237 2.11e-07 ***
## age:factor(test)1 -0.2238 0.1887 -1.187 0.236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27.86 on 764 degrees of freedom
## Multiple R-squared: 0.2437, Adjusted R-squared: 0.2407
## F-statistic: 82.05 on 3 and 764 DF, p-value: < 2.2e-16
plot(pima.lm)
qqnorm(residuals(pima.lm))
shapiro.test(residuals(pima.lm))
##
## Shapiro-Wilk normality test
##
## data: residuals(pima.lm)
## W = 0.97682, p-value = 1.1e-09
ks.test(residuals(pima.lm),pnorm)
## Warning in ks.test(residuals(pima.lm), pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: residuals(pima.lm)
## D = 0.48812, p-value < 2.2e-16
## alternative hypothesis: two-sided
pima.lm0 <- lm(glucose ~ age, data=pima, subset=(test==0))
pima.lm1 <- lm(glucose ~ age, data=pima, subset=(test==1))
with(pima,plot(age,glucose, col=ifelse(test,"red","blue")))
with(pima,lines(age[test==0],predict(pima.lm0,data.frame(age=age[test==0])),lty=1,col="blue"))
with(pima,lines(age[test==1],predict(pima.lm1,data.frame(age=age[test==1])),lty=2,col="red"))
The R function step uses AIC to choose the best model at each step (lower is better with AIC). Forward selection:
bfat.null <- lm(brozek ~ 1, data=fat)
step(bfat.null, scope=list(lower=bfat.null, upper=bfat.lm2), direction="forward")
## Start: AIC=1033.09
## brozek ~ 1
##
## Df Sum of Sq RSS AIC
## + abdom 1 9984.1 5094.9 761.66
## + weight 1 5669.1 9409.9 916.26
## + age 1 1260.9 13818.1 1013.08
## + height 1 119.7 14959.3 1033.08
## <none> 15079.0 1033.09
##
## Step: AIC=761.66
## brozek ~ abdom
##
## Df Sum of Sq RSS AIC
## + weight 1 853.60 4241.3 717.45
## + height 1 391.75 4703.2 743.49
## + age 1 164.67 4930.3 755.38
## <none> 5094.9 761.66
##
## Step: AIC=717.45
## brozek ~ abdom + weight
##
## Df Sum of Sq RSS AIC
## + height 1 34.862 4206.5 717.37
## <none> 4241.3 717.45
## + age 1 2.416 4238.9 719.30
##
## Step: AIC=717.37
## brozek ~ abdom + weight + height
##
## Df Sum of Sq RSS AIC
## <none> 4206.5 717.37
## + age 1 1.4286 4205.0 719.28
##
## Call:
## lm(formula = brozek ~ abdom + weight + height, data = fat)
##
## Coefficients:
## (Intercept) abdom weight height
## -32.6625 0.8798 -0.1204 -0.1182
Backward Selection
step(bfat.lm2,scope=list(lower=bfat.null, upper=bfat.lm2), direction="backward")
## Start: AIC=719.28
## brozek ~ age + weight + height + abdom
##
## Df Sum of Sq RSS AIC
## - age 1 1.43 4206.5 717.37
## <none> 4205.0 719.28
## - height 1 33.87 4238.9 719.30
## - weight 1 415.44 4620.5 741.02
## - abdom 1 2978.22 7183.3 852.22
##
## Step: AIC=717.37
## brozek ~ weight + height + abdom
##
## Df Sum of Sq RSS AIC
## <none> 4206.5 717.37
## - height 1 34.9 4241.3 717.45
## - weight 1 496.7 4703.2 743.49
## - abdom 1 3914.5 8121.0 881.14
##
## Call:
## lm(formula = brozek ~ weight + height + abdom, data = fat)
##
## Coefficients:
## (Intercept) weight height abdom
## -32.6625 -0.1204 -0.1182 0.8798
step(bfat.lm2,scope=list(lower=bfat.null, upper=bfat.lm2), direction="both")
## Start: AIC=719.28
## brozek ~ age + weight + height + abdom
##
## Df Sum of Sq RSS AIC
## - age 1 1.43 4206.5 717.37
## <none> 4205.0 719.28
## - height 1 33.87 4238.9 719.30
## - weight 1 415.44 4620.5 741.02
## - abdom 1 2978.22 7183.3 852.22
##
## Step: AIC=717.37
## brozek ~ weight + height + abdom
##
## Df Sum of Sq RSS AIC
## <none> 4206.5 717.37
## - height 1 34.9 4241.3 717.45
## + age 1 1.4 4205.0 719.28
## - weight 1 496.7 4703.2 743.49
## - abdom 1 3914.5 8121.0 881.14
##
## Call:
## lm(formula = brozek ~ weight + height + abdom, data = fat)
##
## Coefficients:
## (Intercept) weight height abdom
## -32.6625 -0.1204 -0.1182 0.8798