dep <- read.table("H:\\BiostatCourses\\PublicHealthComputing\\Lectures\\Week8ANOVA_SLR\\dep.csv", sep=",") #comma-separated file with no header row
colnames(dep) <- c("dep","type","gend")
dep$gend <- factor(dep$gend, levels=c(0,1), labels=c("Female", "Male")) #recode 0/1 as Female/Male
dep
## dep type gend
## 1 75 Admin Male
## 2 73 Admin Female
## 3 68 Admin Female
## 4 109 Admin Male
## 5 92 Admin Male
## 6 82 Admin Female
## 7 33 Admin Female
## 8 135 Admin Female
## 9 69 Labor Female
## 10 161 Labor Female
## 11 91 Labor Male
## 12 80 Labor Male
## 13 198 Labor Female
## 14 194 Labor Male
## 15 94 Labor Female
## 16 126 Labor Male
## 17 184 Labor Female
## 18 141 Labor Male
## 19 108 Labor Female
## 20 175 Labor Female
## 21 126 Labor Female
## 22 35 Tech Female
## 23 86 Tech Male
## 24 202 Tech Male
## 25 213 Tech Female
## 26 82 Tech Female
## 27 156 Tech Male
## 28 170 Tech Female
## 29 188 Tech Male
## 30 37 Tech Female
## 31 294 Tech Male
## 32 92 Tech Female
## 33 232 Tech Female
## 34 238 Tech Female
## 35 112 Tech Male
## 36 87 Tech Male
## 37 73 Tech Female
## 38 168 Tech Female
## 39 218 Tech Male
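A quick structural check before modelling can be useful (a small sketch; not shown in the original output):
str(dep)                  #dep is numeric; type and gend are the grouping variables
table(dep$type, dep$gend) #cell counts: note that the design is unbalanced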
boxplot(dep ~ type, data=dep) #distribution of dep within each type
dep.aov <- aov(dep ~ type, data=dep)
summary(dep.aov) #F test
## Df Sum Sq Mean Sq F value Pr(>F)
## type 2 24158 12079 3.384 0.045 *
## Residuals 36 128506 3570
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
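The F test assumes roughly equal variances across groups and approximately normal residuals. A quick check is sketched below (not part of the original notes):
bartlett.test(dep ~ type, data=dep) #test of equal variances across the three types
plot(dep.aov, which=1:2)            #residuals vs fitted and normal Q-Q plot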
TukeyHSD(dep.aov) #all pairwise comparisons, controlling the family-wise type I error rate
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = dep ~ type, data = dep)
##
## $type
## diff lwr upr p adj
## Labor-Admin 51.00962 -14.613561 116.63279 0.1533089
## Tech-Admin 65.68056 3.626568 127.73454 0.0360894
## Tech-Labor 14.67094 -38.483211 67.82509 0.7796150
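The Tukey intervals can also be shown graphically (a small sketch, not part of the original output):
plot(TukeyHSD(dep.aov)) #confidence intervals for each pairwise difference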
summary.lm(dep.aov) #coefficient estimates: intercept = Admin mean, others = differences from Admin
##
## Call:
## aov(formula = dep ~ type, data = dep)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.056 -46.880 -1.375 45.115 144.944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.37 21.12 3.947 0.000352 ***
## typeLabor 51.01 26.85 1.900 0.065469 .
## typeTech 65.68 25.39 2.587 0.013864 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.75 on 36 degrees of freedom
## Multiple R-squared: 0.1582, Adjusted R-squared: 0.1115
## F-statistic: 3.384 on 2 and 36 DF, p-value: 0.04501
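The coefficients above are on the treatment-contrast scale: the intercept is the Admin mean and the other two coefficients are differences from Admin. To see the group means themselves, a minimal sketch (not in the original notes):
aggregate(dep ~ type, data=dep, FUN=mean)          #sample mean of dep in each type
coef(dep.aov)[1] + c(Admin = 0, coef(dep.aov)[-1]) #same means recovered from the coefficients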
The anova() function can be used to compare nested models and test whether sets of parameters are zero. This lets us run the same tests we used CONTRAST statements for in SAS.
#Indicator (dummy) variables: VL = 1 for Labor, VT = 1 for Tech; Admin is the reference group
dep$VL <- as.numeric(dep$type == "Labor")
dep$VT <- as.numeric(dep$type == "Tech")
#Tech vs Admin
dep.model1 <- lm(dep ~ VL + VT, data=dep)
dep.model2 <- lm(dep ~ VL, data=dep)
anova(dep.model2, dep.model1) #H_0: beta_T = 0, i.e., Tech mean = Admin mean
## Analysis of Variance Table
##
## Model 1: dep ~ VL
## Model 2: dep ~ VL + VT
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 37 152398
## 2 36 128506 1 23893 6.6933 0.01386 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
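The F statistic reported by anova() can be reproduced by hand from the two residual sums of squares (a sketch, not in the original notes):
rss_reduced <- sum(resid(dep.model2)^2) #RSS of dep ~ VL
rss_full    <- sum(resid(dep.model1)^2) #RSS of dep ~ VL + VT
F_stat <- (rss_reduced - rss_full) / (rss_full / df.residual(dep.model1))
F_stat                                                   #about 6.69, as reported above
pf(F_stat, 1, df.residual(dep.model1), lower.tail=FALSE) #about 0.014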
#Tech vs Labor
dep$VLT <- as.numeric((dep$type == "Labor") | (dep$type == "Tech"))
dep.model3 <- lm(dep ~ VLT, data=dep)
anova(dep.model3, dep.model1) #H_0: beta_T = beta_L, i.e., Tech mean = Labor mean
## Analysis of Variance Table
##
## Model 1: dep ~ VLT
## Model 2: dep ~ VL + VT
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 37 130131
## 2 36 128506 1 1624.7 0.4551 0.5042
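Equivalently, the Tech vs Labor comparison can be read off a model that uses Labor as the reference level (a sketch using base R's relevel; dep.labref is just an illustrative name):
dep.labref <- lm(dep ~ relevel(factor(type), ref="Labor"), data=dep)
summary(dep.labref)$coefficients #the Tech row reproduces the p-value 0.5042 above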
drop1(dep.model1, test="F") #F test of each parameter equal to 0, one term at a time
## Single term deletions
##
## Model:
## dep ~ VL + VT
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 128506 321.91
## VL 1 12886 141392 323.63 3.6099 0.06547 .
## VT 1 23893 152398 326.56 6.6933 0.01386 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
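For a single-degree-of-freedom term, each drop1() F value is the square of the corresponding t statistic from the coefficient table, so these are the same tests reported by summary.lm. A quick sketch:
summary(dep.model1)$coefficients["VT", "t value"]^2 #about 6.69, the F value for VT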
boxplot(dep ~ gend + type, data=dep, col=c("red", "blue")) #red = Female, blue = Male within each type
dep.2way <- aov(dep ~ gend*type, data=dep)
summary(dep.2way)
## Df Sum Sq Mean Sq F value Pr(>F)
## gend 1 2710 2710 0.730 0.3991
## type 2 23431 11716 3.155 0.0557 .
## gend:type 2 3992 1996 0.538 0.5892
## Residuals 33 122531 3713
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.lm(dep.2way)
##
## Call:
## aov(formula = dep ~ gend * type, data = dep)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.00 -45.29 -0.40 35.81 126.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.20 27.25 2.870 0.00712 **
## gendMale 13.80 44.50 0.310 0.75843
## typeLabor 61.18 34.74 1.761 0.08750 .
## typeTech 55.80 33.38 1.672 0.10400
## gendMale:typeLabor -26.77 56.45 -0.474 0.63842
## gendMale:typeTech 20.08 53.06 0.378 0.70762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 60.93 on 33 degrees of freedom
## Multiple R-squared: 0.1974, Adjusted R-squared: 0.07578
## F-statistic: 1.623 on 5 and 33 DF, p-value: 0.1813
Note that the interaction is not significant, so we can drop this term and test for main effects.
dep.main <- aov(dep ~ type + gend, data=dep)
summary(dep.main)
## Df Sum Sq Mean Sq F value Pr(>F)
## type 2 24158 12079 3.341 0.047 *
## gend 1 1983 1983 0.549 0.464
## Residuals 35 126523 3615
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
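Equivalently, the interaction test can be framed as a comparison of the main-effects model with the full two-way model, in the same spirit as the anova() comparisons above (sketch):
anova(dep.main, dep.2way) #F for the gend:type term, about 0.54 as in the two-way table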
Note that the gender effect is not significant, but the type effect is. Now let's look at simple effects: simple effects are what you would test instead of main effects if the interaction were significant.
library(phia)
## Warning: package 'phia' was built under R version 3.4.2
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.2
testInteractions(dep.2way, fixed="type", across="gend", adjustment = "none") #test for gender effect within each level of type
## F Test:
## P-value adjustment method: none
## Value Df Sum of Sq F Pr(>F)
## Admin -13.800 1 357 0.0962 0.7584
## Labor 12.975 1 518 0.1395 0.7112
## Tech -33.875 1 5100 1.3736 0.2496
## Residuals 33 122531
testInteractions(dep.2way, fixed="gend", across="type", adjustment="none") #test for type effect for each gender
## F Test:
## P-value adjustment method: none
## type1 type2 Df Sum of Sq F Pr(>F)
## Female -55.800 5.375 2 13378 1.8015 0.1809
## Male -75.875 -41.475 2 14045 1.8913 0.1669
## Residuals 33 122531
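phia can also tabulate and plot the cell means, which helps when reading the simple-effects tests (a sketch, assuming the installed phia version provides interactionMeans; dep.cellmeans is just an illustrative name):
dep.cellmeans <- interactionMeans(dep.2way)
dep.cellmeans       #mean of dep for each gend-by-type cell
plot(dep.cellmeans) #interaction-style plot of the cell means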
For comparing two groups, the nonparametric counterpart of the two-sample t-test is the Wilcoxon-Mann-Whitney (rank sum) test.
wilcox.test(dep ~ gend, data=dep)
## Warning in wilcox.test.default(x = c(73L, 68L, 82L, 33L, 135L, 69L, 161L, :
## cannot compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: dep by gend
## W = 151, p-value = 0.3533
## alternative hypothesis: true location shift is not equal to 0
For more than two groups, we use the Kruskal-Wallis test.
kruskal.test(dep ~ type, data=dep)
##
## Kruskal-Wallis rank sum test
##
## data: dep by type
## Kruskal-Wallis chi-squared = 6.4713, df = 2, p-value = 0.03934
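If the Kruskal-Wallis test is significant, pairwise follow-up comparisons can be run with the rank-sum test and a multiplicity adjustment (a sketch, not in the original notes):
pairwise.wilcox.test(dep$dep, dep$type, p.adjust.method="bonferroni")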
Scatterplots - use plot()
with(Orange, plot(age, circumference, xlab="Age (days since 12/31/1968)",
     ylab="Trunk Circumference (mm)"))
Correlation - use cor() for the estimate and cor.test() for the hypothesis test and CI
with(Orange, cor(age,circumference, method="pearson"))
## [1] 0.9135189
with(Orange, cor.test(age,circumference, method="pearson"))
##
## Pearson's product-moment correlation
##
## data: age and circumference
## t = 12.9, df = 33, p-value = 1.931e-14
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8342364 0.9557955
## sample estimates:
## cor
## 0.9135189
with(Orange, cor(age,circumference, method="spearman"))
## [1] 0.9064294
with(Orange, cor.test(age,circumference, method="spearman"))
## Warning in cor.test.default(age, circumference, method = "spearman"):
## Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: age and circumference
## S = 668.09, p-value = 6.712e-14
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9064294
orange.lm <- lm(circumference ~ age, data=Orange)
summary(orange.lm)
##
## Call:
## lm(formula = circumference ~ age, data = Orange)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46.310 -14.946 -0.076 19.697 45.111
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.399650 8.622660 2.018 0.0518 .
## age 0.106770 0.008277 12.900 1.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.74 on 33 degrees of freedom
## Multiple R-squared: 0.8345, Adjusted R-squared: 0.8295
## F-statistic: 166.4 on 1 and 33 DF, p-value: 1.931e-14
plot(orange.lm) #regression diagnostic plots (residuals vs fitted, normal Q-Q, scale-location, leverage)
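A small follow-up sketch (not in the original notes): overlay the fitted line on the scatterplot and predict circumference at a hypothetical age of 1000 days (value chosen only for illustration).
with(Orange, plot(age, circumference, xlab="Age (days since 12/31/1968)",
     ylab="Trunk Circumference (mm)"))
abline(orange.lm) #add the least-squares line
predict(orange.lm, newdata=data.frame(age=1000), interval="confidence")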