ANOVA

# Read the depression data set (no header row in the CSV) and name columns.
# NOTE(review): hard-coded absolute Windows path -- adjust for your machine.
dep <- read.table("H:\\BiostatCourses\\PublicHealthComputing\\Lectures\\Week8ANOVA_SLR\\dep.csv",sep=",", header=FALSE)
colnames(dep) <- c("dep","type","gend")
# Make both grouping variables explicit factors.  Since R 4.0,
# read.table() no longer converts strings to factors automatically, so
# convert type ourselves (levels sort alphabetically: Admin, Labor, Tech,
# which is also what aov()/lm() would use implicitly).
dep$type <- factor(dep$type)
dep$gend <- factor(dep$gend, levels=c(0,1), labels=c("Female", "Male"))
dep  # print the full data set
##    dep  type   gend
## 1   75 Admin   Male
## 2   73 Admin Female
## 3   68 Admin Female
## 4  109 Admin   Male
## 5   92 Admin   Male
## 6   82 Admin Female
## 7   33 Admin Female
## 8  135 Admin Female
## 9   69 Labor Female
## 10 161 Labor Female
## 11  91 Labor   Male
## 12  80 Labor   Male
## 13 198 Labor Female
## 14 194 Labor   Male
## 15  94 Labor Female
## 16 126 Labor   Male
## 17 184 Labor Female
## 18 141 Labor   Male
## 19 108 Labor Female
## 20 175 Labor Female
## 21 126 Labor Female
## 22  35  Tech Female
## 23  86  Tech   Male
## 24 202  Tech   Male
## 25 213  Tech Female
## 26  82  Tech Female
## 27 156  Tech   Male
## 28 170  Tech Female
## 29 188  Tech   Male
## 30  37  Tech Female
## 31 294  Tech   Male
## 32  92  Tech Female
## 33 232  Tech Female
## 34 238  Tech Female
## 35 112  Tech   Male
## 36  87  Tech   Male
## 37  73  Tech Female
## 38 168  Tech Female
## 39 218  Tech   Male
# Side-by-side boxplots of depression score by employment type.
# Formula + data= form, consistent with the two-way boxplot call later
# in the file; the drawn plot is identical to boxplot(dep$dep ~ dep$type).
boxplot(dep ~ type, data=dep)

# One-way ANOVA of depression score on employment type.  R uses
# treatment (dummy) coding with the first factor level (Admin) as the
# reference group.
dep.aov <- aov(dep ~ type, data=dep)
summary(dep.aov) #F test of H_0: all three type means are equal
##             Df Sum Sq Mean Sq F value Pr(>F)  
## type         2  24158   12079   3.384  0.045 *
## Residuals   36 128506    3570                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey's Honest Significant Difference: all pairwise comparisons of the
# type means with the family-wise type I error rate held at 0.05.
TukeyHSD(dep.aov) #all pairwise comparison controlling type I error
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = dep ~ type, data = dep)
## 
## $type
##                 diff        lwr       upr     p adj
## Labor-Admin 51.00962 -14.613561 116.63279 0.1533089
## Tech-Admin  65.68056   3.626568 127.73454 0.0360894
## Tech-Labor  14.67094 -38.483211  67.82509 0.7796150
# summary.lm() shows the regression (dummy-coded) view of the aov fit:
# the intercept is the Admin mean; typeLabor and typeTech are the
# differences from Admin (matching the Tukey diff column above).
summary.lm(dep.aov) #get estimated means
## 
## Call:
## aov(formula = dep ~ type, data = dep)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -114.056  -46.880   -1.375   45.115  144.944 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    83.37      21.12   3.947 0.000352 ***
## typeLabor      51.01      26.85   1.900 0.065469 .  
## typeTech       65.68      25.39   2.587 0.013864 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.75 on 36 degrees of freedom
## Multiple R-squared:  0.1582, Adjusted R-squared:  0.1115 
## F-statistic: 3.384 on 2 and 36 DF,  p-value: 0.04501

The anova function can be used to compare nested models, which tests whether the parameters dropped from the larger model are zero. This lets us run the same tests that we would use CONTRAST statements for in SAS.

#Tech vs Admin: build indicator (dummy) variables by hand so that
#nested models can be compared to test individual coefficients.
dep$VL <- ifelse(dep$type == "Labor", 1, 0)   # 1 = Labor, 0 otherwise
dep$VT <- ifelse(dep$type == "Tech", 1, 0)    # 1 = Tech, 0 otherwise
dep.model1 <- lm(dep ~ VL + VT, data=dep)  # full model: both dummies
dep.model2 <- lm(dep ~ VL, data=dep)       # reduced model: VT removed

# General linear F test comparing the reduced model (VL only) to the
# full model (VL + VT) -- the SAS CONTRAST equivalent for beta_T.
anova(dep.model2, dep.model1) #H_0: beta_T = 0
## Analysis of Variance Table
## 
## Model 1: dep ~ VL
## Model 2: dep ~ VL + VT
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     37 152398                              
## 2     36 128506  1     23893 6.6933 0.01386 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Tech vs Labor: single indicator that is 1 for Labor OR Tech, which
#forces those two groups to share one coefficient (beta_L = beta_T).
dep$VLT <- ifelse(dep$type %in% c("Labor", "Tech"), 1, 0)
dep.model3 <- lm(dep ~ VLT, data=dep)  # constrained (pooled) model

# Compare the pooled Labor/Tech model against the full model.
anova(dep.model3, dep.model1) # H_0: beta_T=beta_L
## Analysis of Variance Table
## 
## Model 1: dep ~ VLT
## Model 2: dep ~ VL + VT
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     37 130131                           
## 2     36 128506  1    1624.7 0.4551 0.5042
# drop1() deletes each term from the full model in turn, giving the
# same partial F tests as the pairwise anova() comparisons above.
drop1(dep.model1, test="F") #Run single test of all parameters equal to 0 one by one
## Single term deletions
## 
## Model:
## dep ~ VL + VT
##        Df Sum of Sq    RSS    AIC F value  Pr(>F)  
## <none>              128506 321.91                  
## VL      1     12886 141392 323.63  3.6099 0.06547 .
## VT      1     23893 152398 326.56  6.6933 0.01386 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Two way ANOVA

# Boxplots of depression by gender within each employment type; the two
# colors recycle across the gender levels within each type.
boxplot(dep ~ gend + type, data=dep, col=c("red", "blue"))

# Two-way ANOVA with the gender main effect, type main effect, and the
# gender-by-type interaction (gend*type expands to gend + type + gend:type).
dep.2way <- aov(dep ~ gend*type, data=dep)
summary(dep.2way)  # sequential (Type I) sums of squares
##             Df Sum Sq Mean Sq F value Pr(>F)  
## gend         1   2710    2710   0.730 0.3991  
## type         2  23431   11716   3.155 0.0557 .
## gend:type    2   3992    1996   0.538 0.5892  
## Residuals   33 122531    3713                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Dummy-coded coefficient view of the two-way fit (reference cell:
# Female Admin).
summary.lm(dep.2way)
## 
## Call:
## aov(formula = dep ~ gend * type, data = dep)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -99.00 -45.29  -0.40  35.81 126.12 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           78.20      27.25   2.870  0.00712 **
## gendMale              13.80      44.50   0.310  0.75843   
## typeLabor             61.18      34.74   1.761  0.08750 . 
## typeTech              55.80      33.38   1.672  0.10400   
## gendMale:typeLabor   -26.77      56.45  -0.474  0.63842   
## gendMale:typeTech     20.08      53.06   0.378  0.70762   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.93 on 33 degrees of freedom
## Multiple R-squared:  0.1974, Adjusted R-squared:  0.07578 
## F-statistic: 1.623 on 5 and 33 DF,  p-value: 0.1813

Note that the interaction is not significant, so we can drop this term and test for main effects.

# Additive (main-effects-only) model after dropping the non-significant
# interaction.
dep.main <- aov(dep ~ type + gend, data=dep)
summary(dep.main)
##             Df Sum Sq Mean Sq F value Pr(>F)  
## type         2  24158   12079   3.341  0.047 *
## gend         1   1983    1983   0.549  0.464  
## Residuals   35 126523    3615                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Note that the gender effect is not significant, but type is. Now let’s look at simple effects. Simple effects are what you would test instead of main effects if the interaction were significant.

# phia provides post-hoc "simple effects" tests for factorial designs
# (it loads car as a dependency).
library(phia)
## Warning: package 'phia' was built under R version 3.4.2
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.2
# Unadjusted F tests of the gender difference within each type level.
testInteractions(dep.2way, fixed="type", across="gend", adjustment = "none") #test for gender effect within each level of type
## F Test: 
## P-value adjustment method: none
##             Value Df Sum of Sq      F Pr(>F)
## Admin     -13.800  1       357 0.0962 0.7584
## Labor      12.975  1       518 0.1395 0.7112
##  Tech     -33.875  1      5100 1.3736 0.2496
## Residuals         33    122531
# Unadjusted F tests of the type effect within each gender level.
testInteractions(dep.2way, fixed="gend", across="type", adjustment="none") #test for type effect for each gender
## F Test: 
## P-value adjustment method: none
##             type1   type2 Df Sum of Sq      F Pr(>F)
## Female    -55.800   5.375  2     13378 1.8015 0.1809
##   Male    -75.875 -41.475  2     14045 1.8913 0.1669
## Residuals                 33    122531

Nonparametric ANOVA

For comparing two groups, we use the nonparametric version of the t-test, which is the Wilcoxon-Mann-Whitney test.

# Wilcoxon rank-sum (Mann-Whitney) test of depression by gender.  Tied
# scores force the normal approximation, producing the warning below.
wilcox.test(dep ~ gend, data=dep)
## Warning in wilcox.test.default(x = c(73L, 68L, 82L, 33L, 135L, 69L, 161L, :
## cannot compute exact p-value with ties
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  dep by gend
## W = 151, p-value = 0.3533
## alternative hypothesis: true location shift is not equal to 0

For more than two groups, we use the Kruskal Wallis test.

# Kruskal-Wallis rank test: nonparametric one-way comparison of
# depression across the three employment types.
kruskal.test(dep ~ type, data=dep)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dep by type
## Kruskal-Wallis chi-squared = 6.4713, df = 2, p-value = 0.03934

Simple Linear Regression

Scatterplots - plot

# Scatterplot of trunk circumference against age for the built-in
# Orange data set.  Fixes two label typos from the original: the
# missing closing parenthesis in xlab and "Cirucmference" in ylab.
with(Orange, plot(age,circumference, xlab="Age (days since 12/31/1968)",
                  ylab = "Trunk Circumference (mm)"))

Correlation - cor, cor.test for hypothesis test and CI

# Pearson correlation between age and circumference (returns a single
# number, so the printed result is identical to the with() form).
cor(Orange$age, Orange$circumference, method = "pearson")
## [1] 0.9135189
# Test H_0: rho = 0 and get a 95% confidence interval for the Pearson
# correlation.
with(Orange, cor.test(age,circumference, method="pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  age and circumference
## t = 12.9, df = 33, p-value = 1.931e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8342364 0.9557955
## sample estimates:
##       cor 
## 0.9135189
# Spearman rank correlation (based on ranks, so less sensitive to
# outliers and nonlinearity than Pearson).
cor(Orange$age, Orange$circumference, method = "spearman")
## [1] 0.9064294
# Spearman test of H_0: rho = 0; tied ranks prevent an exact p-value
# (hence the warning below).
with(Orange, cor.test(age,circumference, method="spearman"))
## Warning in cor.test.default(age, circumference, method = "spearman"):
## Cannot compute exact p-value with ties
## 
##  Spearman's rank correlation rho
## 
## data:  age and circumference
## S = 668.09, p-value = 6.712e-14
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.9064294
# Simple linear regression of circumference on age.
orange.lm <- lm(circumference ~ age, data=Orange)
summary(orange.lm)  # slope: ~0.107 mm of growth per day
## 
## Call:
## lm(formula = circumference ~ age, data = Orange)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.310 -14.946  -0.076  19.697  45.111 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17.399650   8.622660   2.018   0.0518 .  
## age          0.106770   0.008277  12.900 1.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23.74 on 33 degrees of freedom
## Multiple R-squared:  0.8345, Adjusted R-squared:  0.8295 
## F-statistic: 166.4 on 1 and 33 DF,  p-value: 1.931e-14
# Regression diagnostics: residuals vs fitted, normal Q-Q,
# scale-location, and residuals vs leverage plots.
plot(orange.lm)