### Principal Components Analysis

### First, set the working directory where R will find the data

### Set the working directory where R will find the data.
### NOTE(review): the original path ended in ".../Wk08.R" -- a file, not a
### directory -- which makes setwd() error. Point at the folder instead.
### (Better practice: avoid setwd() in scripts; use full file paths.)
setwd("H:/data/User/Classes/multiv_13/YTs/YTSplit/Week 08 wmv - 2016")
getwd()
## [1] "H:/data/User/Classes/multiv_13/YTs/YTSplit/Week 08 wmv - 2016"
### Read the data. fileEncoding = "UTF-8-BOM" strips the byte-order mark
### that otherwise mangles the first column name into "i..const" (visible
### in the original head() output further down).
pca1 <- read.csv("Class09a_practice_2014.csv", fileEncoding = "UTF-8-BOM")
head(pca1)
##   ï..const VAR00001 VAR00002 VAR00003 VAR00004 VAR00005 VAR00006 VAR00007
## 1 1 51.21346 45.64752 47.00907 45.31986 37.04117 42.25370 49.42760
## 2 1 36.87658 39.37590 40.37346 61.29191 59.05488 56.30463 49.77283
## 3 1 34.22846 40.95423 44.51697 24.13848 37.21853 41.79234 41.29289
## 4 1 49.71161 47.51437 43.35686 42.71637 47.56493 45.29507 49.82437
## 5 1 38.80370 42.88313 39.37815 55.35847 57.54772 52.60606 60.10324
## 6 1 56.16847 44.82923 54.10363 56.44973 46.21895 50.90457 65.33737
## VAR00008 VAR00009
## 1 51.59151 52.96864
## 2 48.41863 47.49123
## 3 31.66971 39.46853
## 4 40.47604 50.41910
## 5 51.19566 54.24683
## 6 60.89935 62.72507
#install.packages("psych")
library(psych)

#Some of the operations below work on the correlation matrix among the
#variables. The cor function (used just below) stores the correlations
#among our 9 variables in an R matrix.
head(pca1) # re-check the raw data before computing correlations
##   ï..const VAR00001 VAR00002 VAR00003 VAR00004 VAR00005 VAR00006 VAR00007
## 1 1 51.21346 45.64752 47.00907 45.31986 37.04117 42.25370 49.42760
## 2 1 36.87658 39.37590 40.37346 61.29191 59.05488 56.30463 49.77283
## 3 1 34.22846 40.95423 44.51697 24.13848 37.21853 41.79234 41.29289
## 4 1 49.71161 47.51437 43.35686 42.71637 47.56493 45.29507 49.82437
## 5 1 38.80370 42.88313 39.37815 55.35847 57.54772 52.60606 60.10324
## 6 1 56.16847 44.82923 54.10363 56.44973 46.21895 50.90457 65.33737
## VAR00008 VAR00009
## 1 51.59151 52.96864
## 2 48.41863 47.49123
## 3 31.66971 39.46853
## 4 40.47604 50.41910
## 5 51.19566 54.24683
## 6 60.89935 62.72507
# Correlation matrix of the nine measured variables.
# Columns 2-10 are the variables; column 1 is the constant, so it is
# excluded from the correlation computation.
vars <- pca1[2:10]
R <- cor(vars) # saved as R; reused by the Bartlett/KMO/eigen steps below
R
##               VAR00001    VAR00002    VAR00003     VAR00004     VAR00005
## VAR00001 1.0000000000 0.70567853 0.80100600 0.007286975 -0.007413772
## VAR00002 0.7056785274 1.00000000 0.76481968 -0.023655038 -0.031219809
## VAR00003 0.8010059962 0.76481968 1.00000000 -0.044091738 -0.057062711
## VAR00004 0.0072869750 -0.02365504 -0.04409174 1.000000000 0.666264760
## VAR00005 -0.0074137723 -0.03121981 -0.05706271 0.666264760 1.000000000
## VAR00006 0.0002389932 -0.01945949 -0.04373219 0.792004318 0.735822979
## VAR00007 0.0313387643 0.03215471 0.02790158 0.020178123 -0.069409954
## VAR00008 0.0780188199 0.06640387 0.06404979 0.011438271 -0.044932184
## VAR00009 0.0296012888 0.02528933 0.04306396 0.014613354 -0.056498434
## VAR00006 VAR00007 VAR00008 VAR00009
## VAR00001 0.0002389932 0.03133876 0.07801882 0.02960129
## VAR00002 -0.0194594938 0.03215471 0.06640387 0.02528933
## VAR00003 -0.0437321894 0.02790158 0.06404979 0.04306396
## VAR00004 0.7920043177 0.02017812 0.01143827 0.01461335
## VAR00005 0.7358229795 -0.06940995 -0.04493218 -0.05649843
## VAR00006 1.0000000000 0.03351174 0.02305362 0.03564841
## VAR00007 0.0335117374 1.00000000 0.71246075 0.79881659
## VAR00008 0.0230536230 0.71246075 1.00000000 0.74893095
## VAR00009 0.0356484088 0.79881659 0.74893095 1.00000000
#We didn't talk about this in SPSS, but it exists there.
#Bartlett's test of sphericity: tests whether the correlation matrix
#differs from an identity matrix, i.e., whether there is enough shared
#variance among the variables for PCA to be meaningful.
#The n argument is the sample size; here it is set to the number of rows
#in our dataset. If omitted, it defaults to n = 100.
cortest.bartlett(R,n=nrow(pca1))
## $chisq
## [1] 2843.964
##
## $p.value
## [1] 0
##
## $df
## [1] 36
#Results indicate that the p value is < .001 (not literally 0 -- R is just
#printing a value too small to display) and is statistically significant.
#PCA can be done.

#Kaiser-Meyer-Olkin (KMO) Test is a measure of how suited
#your data is for Factor Analysis. The test measures sampling
#adequacy for each variable in the model and for the complete model.
#The statistic is a measure of the proportion of variance among
#variables that might be common variance. The higher the proportion,
#the more suited your data is to Factor Analysis.
#(Rule of thumb: overall MSA around .6 or higher is commonly treated as
#adequate -- confirm against your preferred textbook reference.)
#Again, I never use it, but now you know about it.
KMO(R)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = R)
## Overall MSA = 0.74
## MSA for each item =
## VAR00001 VAR00002 VAR00003 VAR00004 VAR00005 VAR00006 VAR00007 VAR00008
## 0.74 0.79 0.69 0.73 0.79 0.67 0.74 0.80
## VAR00009
## 0.70
p <- ncol(pca1[2:10]) # number of variables (9 here)

# Three-component solution, unrotated.
# NOTE(review): the original comments claimed this was a "full components
# solution" extracting "p=10 components", but nfactors is 3 and there are
# only 9 variables. To get the full solution, set nfactors = p (done in
# the fuller specification below).
# Also fixed: the argument is spelled "nfactors"; the original "nfactor"
# worked only via partial argument matching.
pca <- principal(pca1[2:10], nfactors = 3, rotate = "none") # pca
print(pca)
## Principal Components Analysis
## Call: principal(r = pca1[2:10], nfactors = 3, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 PC3 h2 u2 com
## VAR00001 0.66 -0.08 0.62 0.83 0.17 2.0
## VAR00002 0.66 -0.10 0.60 0.81 0.19 2.0
## VAR00003 0.69 -0.13 0.61 0.88 0.12 2.0
## VAR00004 -0.25 0.78 0.38 0.82 0.18 1.7
## VAR00005 -0.31 0.72 0.41 0.78 0.22 2.0
## VAR00006 -0.25 0.82 0.38 0.87 0.13 1.6
## VAR00007 0.59 0.46 -0.52 0.84 0.16 2.9
## VAR00008 0.61 0.45 -0.48 0.80 0.20 2.8
## VAR00009 0.60 0.47 -0.53 0.87 0.13 2.9
##
## PC1 PC2 PC3
## SS loadings 2.66 2.47 2.36
## Proportion Var 0.30 0.27 0.26
## Cumulative Var 0.30 0.57 0.83
## Proportion Explained 0.36 0.33 0.31
## Cumulative Proportion 0.36 0.69 1.00
##
## Mean item complexity = 2.2
## Test of the hypothesis that 3 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0.04
## with the empirical chi square 69.95 with prob < 3.3e-10
##
## Fit based upon off diagonal values = 0.99
### The psych package has more options and features,
### like reordering loadings by size. This is a fuller specification.

# Full nine-component solution, unrotated, with component scores computed
# by the regression method (each option spelled out explicitly).
pc.results <- principal(pca1[2:10],
                        nfactors       = 9,
                        residuals      = FALSE,
                        rotate         = "none",
                        n.obs          = NA,
                        covar          = FALSE,
                        scores         = TRUE,
                        missing        = FALSE,
                        impute         = "median",
                        oblique.scores = TRUE,
                        method         = "regression")

# Display the solution with items reordered by the size of their loadings.
fa.sort(pc.results, polar = FALSE)
## Principal Components Analysis
## Call: principal(r = pca1[2:10], nfactors = 9, residuals = FALSE, rotate = "none",
## n.obs = NA, covar = FALSE, scores = TRUE, missing = FALSE,
## impute = "median", oblique.scores = TRUE, method = "regression")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 h2 u2 com
## VAR00003 0.69 -0.13 0.61 -0.01 -0.06 0.07 -0.19 0.03 0.28 1 -4.4e-16 2.6
## VAR00001 0.66 -0.08 0.62 0.00 -0.31 0.13 0.14 -0.04 -0.19 1 -8.9e-16 2.8
## VAR00002 0.66 -0.10 0.60 0.01 0.38 -0.18 0.06 0.01 -0.10 1 8.9e-16 2.9
## VAR00008 0.61 0.45 -0.48 0.17 -0.17 -0.37 0.04 -0.04 0.06 1 6.7e-16 4.0
## VAR00009 0.60 0.47 -0.53 -0.01 0.05 0.13 -0.27 0.10 -0.18 1 -2.2e-16 3.8
## VAR00007 0.59 0.46 -0.52 -0.11 0.13 0.23 0.24 -0.05 0.13 1 8.9e-16 4.0
## VAR00006 -0.25 0.82 0.38 -0.10 0.02 -0.01 -0.11 -0.33 -0.02 1 -4.4e-16 2.1
## VAR00004 -0.25 0.78 0.38 -0.32 -0.06 -0.11 0.06 0.23 0.01 1 -8.9e-16 2.4
## VAR00005 -0.31 0.72 0.41 0.43 0.05 0.12 0.05 0.11 0.02 1 0.0e+00 3.0
##
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
## SS loadings 2.66 2.47 2.36 0.34 0.30 0.29 0.21 0.19 0.18
## Proportion Var 0.30 0.27 0.26 0.04 0.03 0.03 0.02 0.02 0.02
## Cumulative Var 0.30 0.57 0.83 0.87 0.90 0.94 0.96 0.98 1.00
## Proportion Explained 0.30 0.27 0.26 0.04 0.03 0.03 0.02 0.02 0.02
## Cumulative Proportion 0.30 0.57 0.83 0.87 0.90 0.94 0.96 0.98 1.00
##
## Mean item complexity = 3.1
## Test of the hypothesis that 9 components are sufficient.
##
## The root mean square of the residuals (RMSR) is 0
## with the empirical chi square 0 with prob < NA
##
## Fit based upon off diagonal values = 1
#To save component scores to your data frame: pc.results$scores holds one
#column of scores per component (PC1..PC9); convert to a data frame and
#append the columns to pca1 with cbind.
pcscore<-as.data.frame(pc.results$scores)
pca1<-cbind(pca1,pcscore)

#The functions below allow you to output some of the parameters above
#for easy plotting

#Everything between this comment frame is probably skippable

##############################################################

# Eigen-decomposition of the correlation matrix: the eigenvalues give the
# variance captured by each component, the eigenvectors the unscaled weights.
e <- eigen(R)
str(e)
## List of 2
## $ values : num [1:9] 2.665 2.47 2.36 0.342 0.301 ...
## $ vectors: num [1:9, 1:9] 0.406 0.403 0.425 -0.155 -0.192 ...
## - attr(*, "class")= chr "eigen"
L <- e$values # placing the eigenvalues in L
# Diagonal matrix with the eigenvalues on the diagonal.
# NOTE(review): the original comment called this an "orthogonal matrix";
# a diagonal matrix of eigenvalues is not orthogonal in general -- it is
# simply diagonal. diag(L) also builds it in one step, replacing the
# zero-matrix-then-fill-the-diagonal idiom.
Vm <- diag(L)
Vm # check -- matrix with eigenvalues on the diagonals
##           [,1]     [,2]     [,3]      [,4]      [,5]      [,6]     [,7]
## [1,] 2.664908 0.000000 0.000000 0.0000000 0.0000000 0.0000000 0.000000
## [2,] 0.000000 2.470263 0.000000 0.0000000 0.0000000 0.0000000 0.000000
## [3,] 0.000000 0.000000 2.360122 0.0000000 0.0000000 0.0000000 0.000000
## [4,] 0.000000 0.000000 0.000000 0.3422123 0.0000000 0.0000000 0.000000
## [5,] 0.000000 0.000000 0.000000 0.0000000 0.3006888 0.0000000 0.000000
## [6,] 0.000000 0.000000 0.000000 0.0000000 0.0000000 0.2879429 0.000000
## [7,] 0.000000 0.000000 0.000000 0.0000000 0.0000000 0.0000000 0.205228
## [8,] 0.000000 0.000000 0.000000 0.0000000 0.0000000 0.0000000 0.000000
## [9,] 0.000000 0.000000 0.000000 0.0000000 0.0000000 0.0000000 0.000000
## [,8] [,9]
## [1,] 0.0000000 0.0000000
## [2,] 0.0000000 0.0000000
## [3,] 0.0000000 0.0000000
## [4,] 0.0000000 0.0000000
## [5,] 0.0000000 0.0000000
## [6,] 0.0000000 0.0000000
## [7,] 0.0000000 0.0000000
## [8,] 0.1922097 0.0000000
## [9,] 0.0000000 0.1764255
# Component loadings: each eigenvector scaled by the square root of its
# eigenvalue. These are the variable-component correlations shown earlier.
sqrt_Vm <- sqrt(Vm)
loadings <- e$vectors %*% sqrt_Vm
loadings # signs are arbitrary and may be flipped relative to other output
##             [,1]        [,2]       [,3]         [,4]        [,5]         [,6]
## [1,] 0.6632900 0.07596723 -0.6236042 0.004241734 0.30617154 -0.125419628
## [2,] 0.6573322 0.10337880 -0.6024657 0.005706841 -0.38414234 0.182289564
## [3,] 0.6940877 0.12853139 -0.6146995 -0.013324172 0.05994492 -0.072886777
## [4,] -0.2526727 -0.78427368 -0.3755914 -0.323318980 0.06481053 0.112650287
## [5,] -0.3126989 -0.71739360 -0.4093575 0.433855736 -0.05263206 -0.121662940
## [6,] -0.2509482 -0.81580614 -0.3770816 -0.096550408 -0.02228656 0.006716692
## [7,] 0.5920990 -0.46434185 0.5240350 -0.106063871 -0.12823737 -0.234750188
## [8,] 0.6113308 -0.44912931 0.4752489 0.168739919 0.17261716 0.365973684
## [9,] 0.6018349 -0.47399959 0.5283105 -0.013174838 -0.04561100 -0.130726431
## [,7] [,8] [,9]
## [1,] -0.13632565 0.039403021 -0.18912116
## [2,] -0.05763089 -0.006593149 -0.10034974
## [3,] 0.18937052 -0.034745257 0.27877355
## [4,] -0.05721039 -0.234749112 0.01402038
## [5,] -0.04622106 -0.107916298 0.02017773
## [6,] 0.10811096 0.327745318 -0.01805650
## [7,] -0.23992810 0.053658078 0.12627781
## [8,] -0.03854974 0.037240764 0.06013688
## [9,] 0.26704274 -0.104739228 -0.17994906
# Flip the sign of any component whose loadings sum to a negative value,
# so every column of loadings2 ends up with a positive total. sign() of
# the column sums gives -1 for a negative total and +1 for a positive one;
# multiplying by diag() of that vector applies the flip column-wise.
# NOTE(review): the original preallocated `sign` with vector() and then
# immediately overwrote it (dead code), while also shadowing base::sign();
# renamed to sgn and the dead line removed.
sgn <- sign(colSums(loadings))
loadings2 <- loadings %*% diag(sgn)
loadings2
##             [,1]        [,2]       [,3]         [,4]        [,5]         [,6]
## [1,] 0.6632900 -0.07596723 0.6236042 0.004241734 -0.30617154 0.125419628
## [2,] 0.6573322 -0.10337880 0.6024657 0.005706841 0.38414234 -0.182289564
## [3,] 0.6940877 -0.12853139 0.6146995 -0.013324172 -0.05994492 0.072886777
## [4,] -0.2526727 0.78427368 0.3755914 -0.323318980 -0.06481053 -0.112650287
## [5,] -0.3126989 0.71739360 0.4093575 0.433855736 0.05263206 0.121662940
## [6,] -0.2509482 0.81580614 0.3770816 -0.096550408 0.02228656 -0.006716692
## [7,] 0.5920990 0.46434185 -0.5240350 -0.106063871 0.12823737 0.234750188
## [8,] 0.6113308 0.44912931 -0.4752489 0.168739919 -0.17261716 -0.365973684
## [9,] 0.6018349 0.47399959 -0.5283105 -0.013174838 0.04561100 0.130726431
## [,7] [,8] [,9]
## [1,] 0.13632565 -0.039403021 -0.18912116
## [2,] 0.05763089 0.006593149 -0.10034974
## [3,] -0.18937052 0.034745257 0.27877355
## [4,] 0.05721039 0.234749112 0.01402038
## [5,] 0.04622106 0.107916298 0.02017773
## [6,] -0.10811096 -0.327745318 -0.01805650
## [7,] 0.23992810 -0.053658078 0.12627781
## [8,] 0.03854974 -0.037240764 0.06013688
## [9,] -0.26704274 0.104739228 -0.17994906
# Proportion of total variance accounted for by each component: each
# eigenvalue divided by the number of variables (the total variance of
# p standardized variables is p).
L/p # the first three components together account for ~83% of the variance
## [1] 0.29610088 0.27447367 0.26223574 0.03802359 0.03340987 0.03199366 0.02280311
## [8] 0.02135664 0.01960284
### Scree plots and other plots

# Basic base-graphics scree plot (fancier ggplot versions follow below).
# Re-derive the eigenvalues from the correlation matrix.
e <- eigen(R)
str(e)
## List of 2
## $ values : num [1:9] 2.665 2.47 2.36 0.342 0.301 ...
## $ vectors: num [1:9, 1:9] 0.406 0.403 0.425 -0.155 -0.192 ...
## - attr(*, "class")= chr "eigen"
L <- e$values # eigenvalues
# Eigenvalues against component number; the dashed horizontal line at 1
# marks the conventional eigenvalue-greater-than-one retention cutoff.
plot(L, type = 'b', xlab = "Component number", ylab = "Eigenvalues",
     main = "Scree Plot")
abline(h = 1, lty = 2)

### The next bunch of images are fancier scree plots

library("factoextra")
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Re-inspect the data frame: the nine component score columns (PC1..PC9)
# appended earlier now follow the original variables.
head(pca1)
##   ï..const VAR00001 VAR00002 VAR00003 VAR00004 VAR00005 VAR00006 VAR00007
## 1 1 51.21346 45.64752 47.00907 45.31986 37.04117 42.25370 49.42760
## 2 1 36.87658 39.37590 40.37346 61.29191 59.05488 56.30463 49.77283
## 3 1 34.22846 40.95423 44.51697 24.13848 37.21853 41.79234 41.29289
## 4 1 49.71161 47.51437 43.35686 42.71637 47.56493 45.29507 49.82437
## 5 1 38.80370 42.88313 39.37815 55.35847 57.54772 52.60606 60.10324
## 6 1 56.16847 44.82923 54.10363 56.44973 46.21895 50.90457 65.33737
## VAR00008 VAR00009 PC1 PC2 PC3 PC4 PC5
## 1 51.59151 52.96864 0.2968874 -0.9528046 -0.9735157 -1.36363129 -1.126847392
## 2 48.41863 47.49123 -1.6104657 1.1445588 -0.4000572 0.30598695 -0.000865611
## 3 31.66971 39.46853 -1.5217140 -2.6366236 -0.8108132 -0.36460074 1.301929921
## 4 40.47604 50.41910 -0.4822639 -0.7615298 -0.3990909 -0.07416445 0.696716960
## 5 51.19566 54.24683 -0.7774072 1.1460709 -1.0513657 0.62134998 0.870803674
## 6 60.89935 62.72507 1.3131128 1.0340728 -0.8988576 -1.08413037 -1.769264801
## PC6 PC7 PC8 PC9
## 1 -0.3689145 -0.2021500 0.64975498 -1.1264236
## 2 0.1161549 0.6453512 0.09348886 0.2642307
## 3 1.9585284 -0.2982362 -1.99199148 1.0268382
## 4 1.8282230 0.6182202 0.32165278 -2.0345053
## 5 0.8435864 0.9917074 0.47663628 -0.7583452
## 6 0.6342123 -1.1163380 0.60056804 0.2636299
# Run the PCA via prcomp, needed to feed the factoextra plotting functions.
# scale. = TRUE standardizes the variables so the analysis is based on the
# correlation matrix, matching the earlier results.
# NOTE(review): the formal argument is spelled "scale." (with a dot); the
# original "scale = TRUE" worked only via partial argument matching.
# The outer parentheses print the result as well as assigning it.
(res.pca <- prcomp(pca1[2:10], scale. = TRUE))
## Standard deviations (1, .., p=9):
## [1] 1.6324546 1.5717071 1.5362688 0.5849892 0.5483510 0.5366031 0.4530210
## [8] 0.4384173 0.4200304
##
## Rotation (n x k) = (9 x 9):
## PC1 PC2 PC3 PC4 PC5 PC6
## VAR00001 -0.4063145 0.04833422 -0.4059213 0.007250961 -0.55834953 0.23372884
## VAR00002 -0.4026649 0.06577485 -0.3921617 0.009755465 0.70054095 -0.33971022
## VAR00003 -0.4251804 0.08177821 -0.4001250 -0.022776785 -0.10931852 0.13582995
## VAR00004 0.1547809 -0.49899482 -0.2444829 -0.552692257 -0.11819168 -0.20993223
## VAR00005 0.1915514 -0.45644231 -0.2664622 0.741647478 0.09598242 0.22672798
## VAR00006 0.1537245 -0.51905738 -0.2454529 -0.165046490 0.04064288 -0.01251706
## VAR00007 -0.3627047 -0.29543791 0.3411089 -0.181309121 0.23385999 0.43747451
## VAR00008 -0.3744857 -0.28575892 0.3093527 0.288449650 -0.31479318 -0.68201930
## VAR00009 -0.3686687 -0.30158266 0.3438920 -0.022521507 0.08317847 0.24361847
## PC7 PC8 PC9
## VAR00001 -0.30092570 0.08987560 0.45025590
## VAR00002 -0.12721463 -0.01503852 0.23891068
## VAR00003 0.41801712 -0.07925156 -0.66369854
## VAR00004 -0.12628640 -0.53544673 -0.03337944
## VAR00005 -0.10202853 -0.24614972 -0.04803873
## VAR00006 0.23864449 0.74756473 0.04298857
## VAR00007 -0.52961810 0.12239042 -0.30063972
## VAR00008 -0.08509482 0.08494365 -0.14317269
## VAR00009 0.58947105 -0.23890304 0.42841916
# Extract the eigenvalues/variances per dimension; these match the
# hand-computed eigenvalues (L) and variance proportions (L/p) above.
get_eig(res.pca)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.6649079 29.610088 29.61009
## Dim.2 2.4702631 27.447367 57.05746
## Dim.3 2.3601217 26.223574 83.28103
## Dim.4 0.3422123 3.802359 87.08339
## Dim.5 0.3006888 3.340987 90.42438
## Dim.6 0.2879429 3.199366 93.62374
## Dim.7 0.2052280 2.280311 95.90405
## Dim.8 0.1922097 2.135664 98.03972
## Dim.9 0.1764255 1.960284 100.00000
# Default factoextra scree plot (percent of explained variance)
fviz_eig(res.pca)

# Same plot with a data label above each bar
fviz_eig(res.pca, addlabels = TRUE, hjust = -0.3)

# Widen the y-axis so the labels are not clipped
fviz_eig(res.pca, addlabels = TRUE, hjust = -0.3) +
  ylim(0, 40)

# Scree plot on the eigenvalue scale rather than percent variance
fviz_eig(res.pca, choice = "eigenvalue", addlabels = TRUE, hjust = -0.3) +
  ylim(0, 5)

# Eigenvalue scree drawn as a line plot only (no bars)
fviz_eig(res.pca, geom = "line", choice = "eigenvalue", addlabels = TRUE,
         hjust = -0.3) +
  ylim(0, 5)