### Two group discriminant function
#install.packages("tidyverse")
#install.packages("glue")
library(tidyverse)
## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --
## v tibble 3.0.3 v purrr 0.3.4
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::%+%() masks psych::%+%()
## x ggplot2::alpha() masks psych::alpha()
## x plyr::arrange() masks dplyr::arrange()
## x purrr::compact() masks plyr::compact()
## x plyr::count() masks dplyr::count()
## x plyr::failwith() masks dplyr::failwith()
## x dplyr::filter() masks stats::filter()
## x plyr::id() masks dplyr::id()
## x dplyr::lag() masks stats::lag()
## x plyr::mutate() masks dplyr::mutate()
## x dplyr::recode() masks car::recode()
## x plyr::rename() masks dplyr::rename()
## x dplyr::select() masks MASS::select()
## x purrr::some() masks car::some()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()
library(MASS)
#install.packages("klaR")
library(klaR)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
theme_set(theme_classic())
# It is common to run the function in a proportion of your sample (training)
# And then validate in another random subset (testing)
# The code below would do this, but we're skipping that here.
#training_sample <- sample(c(TRUE, FALSE), nrow(fitmind), replace = T, prob = c(0.6,0.4))
#train <- fitmind[training_sample, ]
#test <- fitmind[!training_sample, ]
#Below, we'll do a version of this validation, but using the "leave one out" jacknife
#approach used in SPSS
#SPSS does *linear* discriminant analysis. There are many other variants, which I
#don't have here, but google qda (quadratic) and flexible (fda)
#For plotting purposes, it works better in the early functions here if
#all the variables are in common z-score metric
#Discriminant analysis can be affected by the scale/unit in which predictor
#variables are measured. It's generally recommended to standardize/normalize
#continuous predictors before the analysis.
library(psych)
#Creating z-scores, this comes from the psych package.
#Will need to copy/paste into your spreadsheet
Wk04$zHrQOL<-scale(Wk04$HrQOL, center=TRUE, scale=TRUE)
Wk04$zANXIETY<-scale(Wk04$ANXIETY, center=TRUE, scale=TRUE)
Wk04$zDSB<-scale(Wk04$DSB, center=TRUE, scale=TRUE)
Wk04$zSTROOP<-scale(Wk04$STROOP, center=TRUE, scale=TRUE)
Wk04$zBDI<-scale(Wk04$BDI, center=TRUE, scale=TRUE)
head(Wk04)
## ï..const ID Intervention HrQOL ANXIETY DSB STROOP BDI
## 1 1 1 -1 53.95597 58.56832 47.46240 43.53872 31.35924
## 2 1 2 -1 63.20943 58.61767 43.09310 41.61771 31.34360
## 3 1 3 -1 45.61766 59.47867 42.04582 42.70217 31.78394
## 4 1 4 -1 52.04250 60.72693 43.37183 43.79565 30.96338
## 5 1 5 -1 58.61062 55.68169 43.05642 46.88081 30.77085
## 6 1 6 -1 53.17154 60.86040 38.17686 45.05493 30.79877
## Intervention2 Intervention3 zHrQOL zANXIETY zDSB zSTROOP
## 1 -1 Radiation -0.7675890 0.5375471 2.1958457 0.27365119
## 2 -1 Radiation 0.5379419 0.5581803 0.8767780 -0.55418922
## 3 -1 Radiation -1.9440046 0.9181946 0.5606078 -0.08685066
## 4 -1 Radiation -1.0375517 1.4401323 0.9609240 0.38437622
## 5 -1 Radiation -0.1108839 -0.6694491 0.8657036 1.71389349
## 6 -1 Radiation -0.8782603 1.4959404 -0.6074091 0.92704991
## zBDI
## 1 1.0322700
## 2 1.0285342
## 3 1.1337353
## 4 0.9376946
## 5 0.8916959
## 6 0.8983668
#This now creates a reduced data set with only group membership and the z-scored variables
Wk04b<-(Wk04[, c(10:15)])
head(Wk04b)
## Intervention3 zHrQOL zANXIETY zDSB zSTROOP zBDI
## 1 Radiation -0.7675890 0.5375471 2.1958457 0.27365119 1.0322700
## 2 Radiation 0.5379419 0.5581803 0.8767780 -0.55418922 1.0285342
## 3 Radiation -1.9440046 0.9181946 0.5606078 -0.08685066 1.1337353
## 4 Radiation -1.0375517 1.4401323 0.9609240 0.38437622 0.9376946
## 5 Radiation -0.1108839 -0.6694491 0.8657036 1.71389349 0.8916959
## 6 Radiation -0.8782603 1.4959404 -0.6074091 0.92704991 0.8983668
#This does the analysis, and gives consistent results, but the numbers are hard to compare
#to spss because a lot of desireable output is missing, including standardized
#coefficients, etc. We'll do something more SPSS-consistent below
lda.Wk04 <- lda(Intervention3 ~ ., Wk04b)
lda.Wk04 #show results
## Call:
## lda(Intervention3 ~ ., data = Wk04b)
##
## Prior probabilities of groups:
## Immunotherapy Radiation
## 0.5 0.5
##
## Group means:
## zHrQOL zANXIETY zDSB zSTROOP zBDI
## Immunotherapy 0.443326 0.02650804 -0.8571504 -0.5146311 -0.9712017
## Radiation -0.443326 -0.02650804 0.8571504 0.5146311 0.9712017
##
## Coefficients of linear discriminants:
## LD1
## zHrQOL -0.06782866
## zANXIETY -0.19719919
## zDSB 0.80905504
## zSTROOP 0.23859152
## zBDI 4.28036687
#This plot will come out funny, but if you copy to clipboard and change dimensions
#To something like 600:484, it looks good; it is a histogram of discriminant scores
#In each group
plot(lda.Wk04,dimen="1", type="b")
#This saves predicted classification and function scores for each group
Wk04.lda.values <- predict(lda.Wk04)
#This saves the predicted values above to a data frame and then merges (binds) with
#The original data
predout<-as.data.frame(predictions <- lda.Wk04 %>% predict(Wk04b))
Wk04c<-cbind(Wk04b, predout)
head(Wk04c)
## Intervention3 zHrQOL zANXIETY zDSB zSTROOP zBDI
## 1 Radiation -0.7675890 0.5375471 2.1958457 0.27365119 1.0322700
## 2 Radiation 0.5379419 0.5581803 0.8767780 -0.55418922 1.0285342
## 3 Radiation -1.9440046 0.9181946 0.5606078 -0.08685066 1.1337353
## 4 Radiation -1.0375517 1.4401323 0.9609240 0.38437622 0.9376946
## 5 Radiation -0.1108839 -0.6694491 0.8657036 1.71389349 0.8916959
## 6 Radiation -0.8782603 1.4959404 -0.6074091 0.92704991 0.8983668
## class posterior.Immunotherapy posterior.Radiation LD1
## 1 Radiation 9.981767e-28 1 6.206406
## 2 Radiation 9.415055e-22 1 4.833080
## 3 Radiation 1.655903e-23 1 5.236436
## 4 Radiation 4.861150e-21 1 4.669209
## 5 Radiation 9.163483e-23 1 5.065644
## 6 Radiation 2.957016e-15 1 3.339671
#We now can create a cross-classification table, prevlda, that tells us how well we
#classified participants in our data
#Note this is overly optimistic, but we'll look at jacknifed cross-validation below
(prevlda<-table(Wk04c$Intervention3, Wk04c$class)) #object that saves crosstable
##
## Immunotherapy Radiation
## Immunotherapy 82 0
## Radiation 0 82
mean(predictions$class==Wk04c$Intervention3) #one way of computing %accuracy
## [1] 1
(pre_all<-prevlda[1,1]+prevlda[1,2]+prevlda[2,1]+prevlda[2,2]) #total # participants
## [1] 164
(correctclass<-((prevlda[1,1]+prevlda[2,2]))/pre_all) #% correctly classed participants
## [1] 1
(sens_actual<-(prevlda[1,1]/(prevlda[1,1]+prevlda[1,2]))) #% of correctly classed grp 0
## [1] 1
(spec_actual<-(prevlda[2,2]/(prevlda[2,1]+prevlda[2,2]))) #% of correctly classed grp 1
## [1] 1
#install.packages("candisc")
library(pander)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(dplyr)
library(candisc)
##
## Attaching package: 'candisc'
## The following object is masked from 'package:stats':
##
## cancor
# This demonstrates discriminant function as the back end of a MANOVA
# First, we estimate the multivariate linear model
# We'll get the overall effect of group, but also the univariates, which
# SPSS prints preliminary to the discriminant function
mm <- lm(cbind(zHrQOL,zANXIETY,zDSB,zSTROOP,zBDI) ~ Intervention3, Wk04c)
heplots::etasq(mm)
## eta^2
## Intervention3 0.9621161
summary(mm)
## Response zHrQOL :
##
## Call:
## lm(formula = zHrQOL ~ Intervention3, data = Wk04c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.53326 -0.65551 0.03765 0.56876 2.31571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.44333 0.09922 4.468 1.47e-05 ***
## Intervention3Radiation -0.88665 0.14031 -6.319 2.44e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8984 on 162 degrees of freedom
## Multiple R-squared: 0.1977, Adjusted R-squared: 0.1928
## F-statistic: 39.93 on 1 and 162 DF, p-value: 2.442e-09
##
##
## Response zANXIETY :
##
## Call:
## lm(formula = zANXIETY ~ Intervention3, data = Wk04c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.92203 -0.68646 0.06235 0.77643 2.89826
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.02651 0.11073 0.239 0.811
## Intervention3Radiation -0.05302 0.15660 -0.339 0.735
##
## Residual standard error: 1.003 on 162 degrees of freedom
## Multiple R-squared: 0.000707, Adjusted R-squared: -0.005461
## F-statistic: 0.1146 on 1 and 162 DF, p-value: 0.7354
##
##
## Response zDSB :
##
## Call:
## lm(formula = zDSB ~ Intervention3, data = Wk04c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.51000 -0.31602 0.01638 0.32802 1.33870
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.85715 0.05657 -15.15 <2e-16 ***
## Intervention3Radiation 1.71430 0.08000 21.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5122 on 162 degrees of freedom
## Multiple R-squared: 0.7392, Adjusted R-squared: 0.7376
## F-statistic: 459.2 on 1 and 162 DF, p-value: < 2.2e-16
##
##
## Response zSTROOP :
##
## Call:
## lm(formula = zSTROOP ~ Intervention3, data = Wk04c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81840 -0.57571 0.00784 0.59887 2.02610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.51463 0.09487 -5.424 2.08e-07 ***
## Intervention3Radiation 1.02926 0.13417 7.671 1.50e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8591 on 162 degrees of freedom
## Multiple R-squared: 0.2665, Adjusted R-squared: 0.2619
## F-statistic: 58.85 on 1 and 162 DF, p-value: 1.498e-12
##
##
## Response zBDI :
##
## Call:
## lm(formula = zBDI ~ Intervention3, data = Wk04c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.51825 -0.12939 0.01385 0.12417 0.65692
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.97120 0.02501 -38.83 <2e-16 ***
## Intervention3Radiation 1.94240 0.03537 54.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2265 on 162 degrees of freedom
## Multiple R-squared: 0.949, Adjusted R-squared: 0.9487
## F-statistic: 3016 on 1 and 162 DF, p-value: < 2.2e-16
head(Wk04c)
## Intervention3 zHrQOL zANXIETY zDSB zSTROOP zBDI
## 1 Radiation -0.7675890 0.5375471 2.1958457 0.27365119 1.0322700
## 2 Radiation 0.5379419 0.5581803 0.8767780 -0.55418922 1.0285342
## 3 Radiation -1.9440046 0.9181946 0.5606078 -0.08685066 1.1337353
## 4 Radiation -1.0375517 1.4401323 0.9609240 0.38437622 0.9376946
## 5 Radiation -0.1108839 -0.6694491 0.8657036 1.71389349 0.8916959
## 6 Radiation -0.8782603 1.4959404 -0.6074091 0.92704991 0.8983668
## class posterior.Immunotherapy posterior.Radiation LD1
## 1 Radiation 9.981767e-28 1 6.206406
## 2 Radiation 9.415055e-22 1 4.833080
## 3 Radiation 1.655903e-23 1 5.236436
## 4 Radiation 4.861150e-21 1 4.669209
## 5 Radiation 9.163483e-23 1 5.065644
## 6 Radiation 2.957016e-15 1 3.339671
#This introduces a new function, from the candisc (canonical discriminant) package
#It produces output more consistent with SPSS
cdl<-candiscList(mm)
cdl %>% summary(coef = c("raw", "std", "structure"))
##
## Term: Intervention3
##
## Canonical Discriminant Analysis for Intervention3:
##
## CanRsq Eigenvalue Difference Percent Cumulative
## 1 0.96212 25.396 100 100
##
## Class means:
##
## [1] 5.0087 -5.0087
##
## raw coefficients:
## [1] 0.067829 0.197199 -0.809055 -0.238592 -4.280367
##
## std coefficients:
## [1] 0.060941 0.197737 -0.414435 -0.204975 -0.969437
##
## structure coefficients:
## [1] 0.453354 0.027108 -0.876539 -0.526272 -0.993171
cdl %>% plot
# This produces a nice, printable table of standardized coefficients.
# options are "coeffs.std", "coeffs.raw" and "structure"
cbind(cdl$Intervention3$coeffs.std) %>%
set_colnames(c("Intervention Group")) %>%
pander("Std Coefficients")
Std Coefficients
0.06094 |
0.1977 |
-0.4144 |
-0.205 |
-0.9694 |
#In the plot below, we get two panels
#Left panel plots canonical discriminant function boxplots for the two groups
#Right panel plots standardized coefficients for each measure.
#With the vlim command in the second plot, I set it to the min and max of the
#the standardized coefficients. This is a very cool plot!
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
p1 <- ggplot(cdl$Intervention3$scores, aes(Intervention3,Can1)) +
geom_boxplot() +
ylab("Canonical Variate for Intervention Group") +
xlab("Intervention Group")
p1
p2 <- ggplot(cdl$Intervention3$coeffs.std %>%
as.data.frame %>%
tibble::rownames_to_column(var = "Measure"),
aes(Measure, Can1)) +
geom_segment(aes(xend = Measure, yend = 0),
arrow = arrow(type = "closed",
ends = "first",
angle = "15",
length = unit(0.6, "cm"))) +
scale_x_discrete(name ="Instrument",labels=c("HrQOL","Anxiety", "Dig span B", "Stroop", "BDI"))+
ylim(-1,1) +
ylab("Standardized Coefficients for Intervention Group")
gridExtra::grid.arrange(p1,p2, nrow = 1, ncol = 2)
p2
# Helper function to do leave-one-out cross-validation
# This is comparable to the original lda analysis, and not the candisc version
# Candisc version gives us all kinds of diagnostics but doesn't directly match
# the
vlda = function(v,formula,data,cl){
require(MASS)
grps = cut(1:nrow(data),v,labels=FALSE)[sample(1:nrow(data))]
pred = lapply(1:v,function(i,formula,data){
omit = which(grps == i)
z = lda(formula,data=data[-omit,])
predict(z,data[omit,])
},formula,data)
wh = unlist(lapply(pred,function(pp)pp$class))
table(wh,cl[order(grps)])
}
#fitmind4<-fitmind[2:12]
#This function accepts four arguments:
#v, the number of folds in the cross classification,
# formula which is the formula used in the linear discriminant analysis,
# data which is the data frame to use, and
# cl, the classification variable
# By using the sample function, we make sure that the groups that are used for
#cross-validation aren't influenced by the ordering of the data - notice how the
#classification variable (cl) is indexed by order(grps) to make sure that the
#predicted and actual values line up properly.
#Applying this function to the fitmind data will give us a better idea of the
#actual error rate of the classifier:
head(Wk04c)
## Intervention3 zHrQOL zANXIETY zDSB zSTROOP zBDI
## 1 Radiation -0.7675890 0.5375471 2.1958457 0.27365119 1.0322700
## 2 Radiation 0.5379419 0.5581803 0.8767780 -0.55418922 1.0285342
## 3 Radiation -1.9440046 0.9181946 0.5606078 -0.08685066 1.1337353
## 4 Radiation -1.0375517 1.4401323 0.9609240 0.38437622 0.9376946
## 5 Radiation -0.1108839 -0.6694491 0.8657036 1.71389349 0.8916959
## 6 Radiation -0.8782603 1.4959404 -0.6074091 0.92704991 0.8983668
## class posterior.Immunotherapy posterior.Radiation LD1
## 1 Radiation 9.981767e-28 1 6.206406
## 2 Radiation 9.415055e-22 1 4.833080
## 3 Radiation 1.655903e-23 1 5.236436
## 4 Radiation 4.861150e-21 1 4.669209
## 5 Radiation 9.163483e-23 1 5.065644
## 6 Radiation 2.957016e-15 1 3.339671
vlda(164,Intervention3~.,Wk04c[,1:6],Wk04c$Intervention3)
##
## wh Immunotherapy Radiation
## Immunotherapy 82 0
## Radiation 0 82
(getvlda<-vlda(164,Intervention3~.,Wk04c[,1:6],Wk04c$Intervention3) )
##
## wh Immunotherapy Radiation
## Immunotherapy 82 0
## Radiation 0 82
(all<-getvlda[1,1]+getvlda[1,2]+getvlda[2,1]+getvlda[2,2])
## [1] 164
(pred_jack<-(getvlda[1,1]+getvlda[2,2])/all)
## [1] 1
(sens_jack<-(getvlda[1,1]/(getvlda[1,1]+getvlda[2,1])))
## [1] 1
(spec_jack<-(getvlda[2,2]/(getvlda[1,2]+getvlda[2,2])))
## [1] 1