Analyze->Classify->Discriminant

### Two group discriminant function

#install.packages("tidyverse")
#install.packages("glue")
library(tidyverse)

## -- Attaching packages ------------------------------------------------ tidyverse 1.3.0 --

## v tibble  3.0.3     v purrr   0.3.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts --------------------------------------------------- tidyverse_conflicts() --
## x ggplot2::%+%()    masks psych::%+%()
## x ggplot2::alpha()  masks psych::alpha()
## x plyr::arrange()   masks dplyr::arrange()
## x purrr::compact()  masks plyr::compact()
## x plyr::count()     masks dplyr::count()
## x plyr::failwith()  masks dplyr::failwith()
## x dplyr::filter()   masks stats::filter()
## x plyr::id()        masks dplyr::id()
## x dplyr::lag()      masks stats::lag()
## x plyr::mutate()    masks dplyr::mutate()
## x dplyr::recode()   masks car::recode()
## x plyr::rename()    masks dplyr::rename()
## x dplyr::select()   masks MASS::select()
## x purrr::some()     masks car::some()
## x plyr::summarise() masks dplyr::summarise()
## x plyr::summarize() masks dplyr::summarize()

library(MASS)
#install.packages("klaR")
library(klaR)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

theme_set(theme_classic())

# It is common to run the function in a proportion of your sample (training)
# And then validate in another random subset (testing)
# The code below would do this, but we're skipping that here.

#training_sample <- sample(c(TRUE, FALSE), nrow(fitmind), replace = T, prob = c(0.6,0.4))
#train <- fitmind[training_sample, ]
#test <- fitmind[!training_sample, ]

#Below, we'll do a version of this validation, but using the "leave one out" jacknife
#approach used in SPSS

#SPSS does *linear* discriminant analysis. There are many other variants, which I
#don't have here, but google qda (quadratic) and flexible (fda)

#For plotting purposes, it works better in the early functions here if 
#all the variables are in common z-score metric
#Discriminant analysis can be affected by the scale/unit in which predictor 
#variables are measured. It's generally recommended to standardize/normalize 
#continuous predictors before the analysis.


library(psych)

#Creating z-scores, this comes from the psych package. 
#Will need to copy/paste into your spreadsheet

Wk04$zHrQOL<-scale(Wk04$HrQOL, center=TRUE, scale=TRUE)
Wk04$zANXIETY<-scale(Wk04$ANXIETY, center=TRUE, scale=TRUE)
Wk04$zDSB<-scale(Wk04$DSB, center=TRUE, scale=TRUE)
Wk04$zSTROOP<-scale(Wk04$STROOP, center=TRUE, scale=TRUE)
Wk04$zBDI<-scale(Wk04$BDI, center=TRUE, scale=TRUE)

head(Wk04)

##   ï..const ID Intervention    HrQOL  ANXIETY      DSB   STROOP      BDI
## 1        1  1           -1 53.95597 58.56832 47.46240 43.53872 31.35924
## 2        1  2           -1 63.20943 58.61767 43.09310 41.61771 31.34360
## 3        1  3           -1 45.61766 59.47867 42.04582 42.70217 31.78394
## 4        1  4           -1 52.04250 60.72693 43.37183 43.79565 30.96338
## 5        1  5           -1 58.61062 55.68169 43.05642 46.88081 30.77085
## 6        1  6           -1 53.17154 60.86040 38.17686 45.05493 30.79877
##   Intervention2 Intervention3     zHrQOL   zANXIETY       zDSB     zSTROOP
## 1            -1     Radiation -0.7675890  0.5375471  2.1958457  0.27365119
## 2            -1     Radiation  0.5379419  0.5581803  0.8767780 -0.55418922
## 3            -1     Radiation -1.9440046  0.9181946  0.5606078 -0.08685066
## 4            -1     Radiation -1.0375517  1.4401323  0.9609240  0.38437622
## 5            -1     Radiation -0.1108839 -0.6694491  0.8657036  1.71389349
## 6            -1     Radiation -0.8782603  1.4959404 -0.6074091  0.92704991
##        zBDI
## 1 1.0322700
## 2 1.0285342
## 3 1.1337353
## 4 0.9376946
## 5 0.8916959
## 6 0.8983668

#This now creates a reduced data set with only group membership and the z-scored variables
Wk04b<-(Wk04[, c(10:15)])
head(Wk04b)

##   Intervention3     zHrQOL   zANXIETY       zDSB     zSTROOP      zBDI
## 1     Radiation -0.7675890  0.5375471  2.1958457  0.27365119 1.0322700
## 2     Radiation  0.5379419  0.5581803  0.8767780 -0.55418922 1.0285342
## 3     Radiation -1.9440046  0.9181946  0.5606078 -0.08685066 1.1337353
## 4     Radiation -1.0375517  1.4401323  0.9609240  0.38437622 0.9376946
## 5     Radiation -0.1108839 -0.6694491  0.8657036  1.71389349 0.8916959
## 6     Radiation -0.8782603  1.4959404 -0.6074091  0.92704991 0.8983668

#This does the analysis, and gives consistent results, but the numbers are hard to compare
#to spss because a lot of desireable output is missing, including standardized
#coefficients, etc. We'll do something more SPSS-consistent below

lda.Wk04 <- lda(Intervention3 ~ ., Wk04b)
lda.Wk04 #show results

## Call:
## lda(Intervention3 ~ ., data = Wk04b)
## 
## Prior probabilities of groups:
## Immunotherapy     Radiation 
##           0.5           0.5 
## 
## Group means:
##                  zHrQOL    zANXIETY       zDSB    zSTROOP       zBDI
## Immunotherapy  0.443326  0.02650804 -0.8571504 -0.5146311 -0.9712017
## Radiation     -0.443326 -0.02650804  0.8571504  0.5146311  0.9712017
## 
## Coefficients of linear discriminants:
##                  LD1
## zHrQOL   -0.06782866
## zANXIETY -0.19719919
## zDSB      0.80905504
## zSTROOP   0.23859152
## zBDI      4.28036687

#This plot will come out funny, but if you copy to clipboard and change dimensions
#To something like 600:484, it looks good; it is a histogram of discriminant scores
#In each group
plot(lda.Wk04,dimen="1", type="b")

#This saves predicted classification and function scores for each group
Wk04.lda.values <- predict(lda.Wk04)

#This saves the predicted values above to a data frame and then merges (binds) with
#The original data
predout<-as.data.frame(predictions <- lda.Wk04 %>% predict(Wk04b))
Wk04c<-cbind(Wk04b, predout)
head(Wk04c)

##   Intervention3     zHrQOL   zANXIETY       zDSB     zSTROOP      zBDI
## 1     Radiation -0.7675890  0.5375471  2.1958457  0.27365119 1.0322700
## 2     Radiation  0.5379419  0.5581803  0.8767780 -0.55418922 1.0285342
## 3     Radiation -1.9440046  0.9181946  0.5606078 -0.08685066 1.1337353
## 4     Radiation -1.0375517  1.4401323  0.9609240  0.38437622 0.9376946
## 5     Radiation -0.1108839 -0.6694491  0.8657036  1.71389349 0.8916959
## 6     Radiation -0.8782603  1.4959404 -0.6074091  0.92704991 0.8983668
##       class posterior.Immunotherapy posterior.Radiation      LD1
## 1 Radiation            9.981767e-28                   1 6.206406
## 2 Radiation            9.415055e-22                   1 4.833080
## 3 Radiation            1.655903e-23                   1 5.236436
## 4 Radiation            4.861150e-21                   1 4.669209
## 5 Radiation            9.163483e-23                   1 5.065644
## 6 Radiation            2.957016e-15                   1 3.339671

#We now can create a cross-classification table, prevlda, that tells us how well we
#classified participants in our data
#Note this is overly optimistic, but we'll look at jacknifed cross-validation below

(prevlda<-table(Wk04c$Intervention3, Wk04c$class)) #object that saves crosstable

##                
##                 Immunotherapy Radiation
##   Immunotherapy            82         0
##   Radiation                 0        82

mean(predictions$class==Wk04c$Intervention3) #one way of computing %accuracy

## [1] 1

(pre_all<-prevlda[1,1]+prevlda[1,2]+prevlda[2,1]+prevlda[2,2]) #total # participants

## [1] 164

(correctclass<-((prevlda[1,1]+prevlda[2,2]))/pre_all) #% correctly classed participants

## [1] 1

(sens_actual<-(prevlda[1,1]/(prevlda[1,1]+prevlda[1,2]))) #% of correctly classed grp 0

## [1] 1

(spec_actual<-(prevlda[2,2]/(prevlda[2,1]+prevlda[2,2]))) #% of correctly classed grp 1

## [1] 1

#install.packages("candisc")
library(pander)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(dplyr)
library(candisc)

## 
## Attaching package: 'candisc'

## The following object is masked from 'package:stats':
## 
##     cancor

# This demonstrates discriminant function as the back end of a MANOVA
# First, we estimate the multivariate linear model
# We'll get the overall effect of group, but also the univariates, which 
# SPSS prints preliminary to the discriminant function
mm <- lm(cbind(zHrQOL,zANXIETY,zDSB,zSTROOP,zBDI) ~ Intervention3, Wk04c)
heplots::etasq(mm)

##                   eta^2
## Intervention3 0.9621161

summary(mm)

## Response zHrQOL :
## 
## Call:
## lm(formula = zHrQOL ~ Intervention3, data = Wk04c)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.53326 -0.65551  0.03765  0.56876  2.31571 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.44333    0.09922   4.468 1.47e-05 ***
## Intervention3Radiation -0.88665    0.14031  -6.319 2.44e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8984 on 162 degrees of freedom
## Multiple R-squared:  0.1977, Adjusted R-squared:  0.1928 
## F-statistic: 39.93 on 1 and 162 DF,  p-value: 2.442e-09
## 
## 
## Response zANXIETY :
## 
## Call:
## lm(formula = zANXIETY ~ Intervention3, data = Wk04c)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.92203 -0.68646  0.06235  0.77643  2.89826 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)             0.02651    0.11073   0.239    0.811
## Intervention3Radiation -0.05302    0.15660  -0.339    0.735
## 
## Residual standard error: 1.003 on 162 degrees of freedom
## Multiple R-squared:  0.000707,   Adjusted R-squared:  -0.005461 
## F-statistic: 0.1146 on 1 and 162 DF,  p-value: 0.7354
## 
## 
## Response zDSB :
## 
## Call:
## lm(formula = zDSB ~ Intervention3, data = Wk04c)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.51000 -0.31602  0.01638  0.32802  1.33870 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.85715    0.05657  -15.15   <2e-16 ***
## Intervention3Radiation  1.71430    0.08000   21.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5122 on 162 degrees of freedom
## Multiple R-squared:  0.7392, Adjusted R-squared:  0.7376 
## F-statistic: 459.2 on 1 and 162 DF,  p-value: < 2.2e-16
## 
## 
## Response zSTROOP :
## 
## Call:
## lm(formula = zSTROOP ~ Intervention3, data = Wk04c)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.81840 -0.57571  0.00784  0.59887  2.02610 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.51463    0.09487  -5.424 2.08e-07 ***
## Intervention3Radiation  1.02926    0.13417   7.671 1.50e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8591 on 162 degrees of freedom
## Multiple R-squared:  0.2665, Adjusted R-squared:  0.2619 
## F-statistic: 58.85 on 1 and 162 DF,  p-value: 1.498e-12
## 
## 
## Response zBDI :
## 
## Call:
## lm(formula = zBDI ~ Intervention3, data = Wk04c)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51825 -0.12939  0.01385  0.12417  0.65692 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -0.97120    0.02501  -38.83   <2e-16 ***
## Intervention3Radiation  1.94240    0.03537   54.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2265 on 162 degrees of freedom
## Multiple R-squared:  0.949,  Adjusted R-squared:  0.9487 
## F-statistic:  3016 on 1 and 162 DF,  p-value: < 2.2e-16

head(Wk04c)

##   Intervention3     zHrQOL   zANXIETY       zDSB     zSTROOP      zBDI
## 1     Radiation -0.7675890  0.5375471  2.1958457  0.27365119 1.0322700
## 2     Radiation  0.5379419  0.5581803  0.8767780 -0.55418922 1.0285342
## 3     Radiation -1.9440046  0.9181946  0.5606078 -0.08685066 1.1337353
## 4     Radiation -1.0375517  1.4401323  0.9609240  0.38437622 0.9376946
## 5     Radiation -0.1108839 -0.6694491  0.8657036  1.71389349 0.8916959
## 6     Radiation -0.8782603  1.4959404 -0.6074091  0.92704991 0.8983668
##       class posterior.Immunotherapy posterior.Radiation      LD1
## 1 Radiation            9.981767e-28                   1 6.206406
## 2 Radiation            9.415055e-22                   1 4.833080
## 3 Radiation            1.655903e-23                   1 5.236436
## 4 Radiation            4.861150e-21                   1 4.669209
## 5 Radiation            9.163483e-23                   1 5.065644
## 6 Radiation            2.957016e-15                   1 3.339671

#This introduces a new function, from the candisc (canonical discriminant) package
#It produces output more consistent with SPSS

cdl<-candiscList(mm)
cdl %>% summary(coef = c("raw", "std", "structure"))

## 
## Term: Intervention3 
## 
## Canonical Discriminant Analysis for Intervention3:
## 
##    CanRsq Eigenvalue Difference Percent Cumulative
## 1 0.96212     25.396                100        100
## 
## Class means:
## 
## [1]  5.0087 -5.0087
## 
##  raw coefficients:
## [1]  0.067829  0.197199 -0.809055 -0.238592 -4.280367
## 
##  std coefficients:
## [1]  0.060941  0.197737 -0.414435 -0.204975 -0.969437
## 
##  structure coefficients:
## [1]  0.453354  0.027108 -0.876539 -0.526272 -0.993171

cdl %>% plot

# This produces a nice, printable table of standardized coefficients. 
# options are "coeffs.std", "coeffs.raw" and "structure"
cbind(cdl$Intervention3$coeffs.std) %>%
  set_colnames(c("Intervention Group")) %>%
  pander("Std Coefficients")

Std Coefficients
Intervention Group
0.06094
0.1977
-0.4144
-0.205
-0.9694

#In the plot below, we get two panels
#Left panel plots canonical discriminant function boxplots for the two groups
#Right panel plots standardized coefficients for each measure. 
#With the vlim command in the second plot, I set it to the min and max of the
#the standardized coefficients. This is a very cool plot!

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

p1 <- ggplot(cdl$Intervention3$scores, aes(Intervention3,Can1)) + 
  geom_boxplot() + 
  ylab("Canonical Variate for Intervention Group") +
  xlab("Intervention Group")
p1

p2 <- ggplot(cdl$Intervention3$coeffs.std %>% 
               as.data.frame %>% 
               tibble::rownames_to_column(var = "Measure"),
             aes(Measure, Can1)) + 
  geom_segment(aes(xend = Measure, yend = 0), 
               arrow = arrow(type = "closed", 
                             ends = "first", 
                             angle = "15", 
                             length = unit(0.6, "cm"))) +
  scale_x_discrete(name ="Instrument",labels=c("HrQOL","Anxiety", "Dig span B", "Stroop", "BDI"))+
  ylim(-1,1) + 
  ylab("Standardized Coefficients for Intervention Group")
gridExtra::grid.arrange(p1,p2, nrow = 1, ncol = 2)

p2

# Helper function to do leave-one-out cross-validation
# This is comparable to the original lda analysis, and not the candisc version
# Candisc version gives us all kinds of diagnostics but doesn't directly match
# the 
vlda = function(v,formula,data,cl){
  require(MASS)
  grps = cut(1:nrow(data),v,labels=FALSE)[sample(1:nrow(data))]
  pred = lapply(1:v,function(i,formula,data){
    omit = which(grps == i)
    z = lda(formula,data=data[-omit,])
    predict(z,data[omit,])
  },formula,data)
  
  wh = unlist(lapply(pred,function(pp)pp$class))
  table(wh,cl[order(grps)])
}
#fitmind4<-fitmind[2:12]
#This function accepts four arguments: 
#v, the number of folds in the cross classification, 
# formula which is the formula used in the linear discriminant analysis, 
# data which is the data frame to use, and 
# cl, the classification variable 
# By using the sample function, we make sure that the groups that are used for 
#cross-validation aren't influenced by the ordering of the data - notice how the 
#classification variable (cl) is indexed by order(grps) to make sure that the 
#predicted and actual values line up properly.
#Applying this function to the fitmind data will give us a better idea of the 
#actual error rate of the classifier:
head(Wk04c)

##   Intervention3     zHrQOL   zANXIETY       zDSB     zSTROOP      zBDI
## 1     Radiation -0.7675890  0.5375471  2.1958457  0.27365119 1.0322700
## 2     Radiation  0.5379419  0.5581803  0.8767780 -0.55418922 1.0285342
## 3     Radiation -1.9440046  0.9181946  0.5606078 -0.08685066 1.1337353
## 4     Radiation -1.0375517  1.4401323  0.9609240  0.38437622 0.9376946
## 5     Radiation -0.1108839 -0.6694491  0.8657036  1.71389349 0.8916959
## 6     Radiation -0.8782603  1.4959404 -0.6074091  0.92704991 0.8983668
##       class posterior.Immunotherapy posterior.Radiation      LD1
## 1 Radiation            9.981767e-28                   1 6.206406
## 2 Radiation            9.415055e-22                   1 4.833080
## 3 Radiation            1.655903e-23                   1 5.236436
## 4 Radiation            4.861150e-21                   1 4.669209
## 5 Radiation            9.163483e-23                   1 5.065644
## 6 Radiation            2.957016e-15                   1 3.339671

vlda(164,Intervention3~.,Wk04c[,1:6],Wk04c$Intervention3)

##                
## wh              Immunotherapy Radiation
##   Immunotherapy            82         0
##   Radiation                 0        82

(getvlda<-vlda(164,Intervention3~.,Wk04c[,1:6],Wk04c$Intervention3) )

##                
## wh              Immunotherapy Radiation
##   Immunotherapy            82         0
##   Radiation                 0        82

(all<-getvlda[1,1]+getvlda[1,2]+getvlda[2,1]+getvlda[2,2])

## [1] 164

(pred_jack<-(getvlda[1,1]+getvlda[2,2])/all)

## [1] 1

(sens_jack<-(getvlda[1,1]/(getvlda[1,1]+getvlda[2,1])))

## [1] 1

(spec_jack<-(getvlda[2,2]/(getvlda[1,2]+getvlda[2,2])))

## [1] 1