#Regression outlier diagnostics
#The function references an lm regression object, here steptotal2

library(olsrr)
ols_plot_cooksd_bar(steptotal2)

ols_plot_cooksd_chart(steptotal2)

ols_plot_dffits(steptotal2)

ols_plot_dfbetas(steptotal2)

showcases<-data.frame(dfbetas(steptotal2))
showcases$ID<-rownames(showcases)
showcases_final<-subset(showcases,gender>.17 |depress>.17 | ds_cor>.17 |
vo_cor>.17| menthlth>.17 |anxiety >.17)
showcases_final
##     X.Intercept.      gender     depress       ds_cor       vo_cor    menthlth
## 2 -0.06127439 -0.07653325 0.09426396 0.068547483 0.187683669 -0.02628661
## 5 -0.18073401 0.19272968 0.02483105 -0.014278203 0.013642992 0.03425968
## 15 -0.33009395 0.25174443 0.06544861 0.039773242 0.094671410 0.09945893
## 27 0.21746623 -0.34379863 0.30947776 0.157581219 -0.094028375 0.01140294
## 30 0.06136390 -0.02237952 0.01514400 -0.155908762 0.184106086 -0.29748190
## 34 -0.26062954 0.20930583 0.14502117 0.123729446 0.038225682 -0.17912154
## 45 -0.18696372 0.03201440 -0.01363296 0.227666641 -0.111982248 0.31722706
## 47 0.06002788 0.07262708 -0.11930984 -0.146384391 -0.142229912 0.35914005
## 58 -0.03503247 0.06989579 0.17871914 -0.073739205 0.119286893 -0.10648030
## 64 -0.04532458 -0.05082022 0.06045690 0.190102321 -0.001355281 -0.10936855
## 67 -0.02255921 0.06115798 -0.33960125 -0.079721025 -0.040528475 0.33038254
## 89 -0.02089405 -0.07664062 0.17867736 0.181008090 -0.108972977 0.04843759
## 95 0.01330455 -0.24168950 0.11306492 0.142131377 0.161493771 0.07918421
## 98 -0.21157018 0.22885813 0.07468030 0.029474067 -0.074890879 -0.01721209
## 101 0.22630830 -0.10247629 -0.16664817 -0.041640652 -0.386420839 0.33088521
## 104 0.01234059 -0.11736650 0.28058839 0.008768735 0.160419235 -0.31808921
## 118 -0.18237342 0.06624524 -0.13901927 0.165005954 0.074393003 0.18574060
## 127 -0.03971225 0.09054762 -0.10786185 -0.106019974 0.239676564 -0.07809147
## 131 0.07506996 -0.02905800 -0.10185296 -0.254776389 0.184824312 -0.01460179
## 136 -0.15610122 0.06346904 -0.01949422 0.100627807 0.179095722 -0.04675941
## 138 -0.12469988 0.17085115 -0.01299457 -0.010440285 -0.059281441 0.01848149
## 141 -0.03275996 0.02612055 0.26165238 0.070861406 -0.082608458 -0.15938658
## 144 -0.06457321 0.01562712 0.17642106 0.038363898 0.009023344 0.03468457
## anxiety ID
## 2 -0.112271982 2
## 5 -0.040349571 5
## 15 0.048634999 15
## 27 -0.133539106 27
## 30 0.208033602 30
## 34 0.148807408 34
## 45 0.201560803 45
## 47 -0.323731565 47
## 58 -0.069985576 58
## 64 -0.018346832 64
## 67 0.154381109 67
## 89 -0.242441616 89
## 95 0.204512742 95
## 98 0.049695131 98
## 101 -0.550817996 101
## 104 0.078178590 104
## 118 0.176788185 118
## 127 -0.004963135 127
## 131 0.078522339 131
## 136 0.220185010 136
## 138 -0.039100496 138
## 141 0.102264483 141
## 144 -0.026342339 144
ols_plot_resid_stud(steptotal2)

ols_plot_resid_stand(steptotal2)

ols_plot_resid_lev(steptotal2)

# Mahalanobi's outlier detection

#install.packages('mvoutlier')
library(mvoutlier)
## Loading required package: sgeostat
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
# Requires a non-missing data set

# Sign function is a quick function to flag outliers into a list
# Note that the mahalanobis it generates is not squared, but we get a critical
# value of the raw mahalanobis called const.

#This only wants continuous variables, so I had to drop gender and anxiety (because
#of range restriction)

signout<-(sign1(Wk11b2_nomiss[,c(2,4,5,6)], makeplot = TRUE, qcrit = 0.95))

# We need to transpose and turn the signout list into an object we can merge with our data

#install.packages('devtools')
library('devtools')
## Loading required package: usethis
#The first step gives us a data frame

signout2 <-as.data.frame(signout)

# Next, we merge the three variables from the outlier flagging into our data fram

Wk11b2_nomiss$outflag<-signout2$wfinal01
Wk11b2_nomiss$mahdist<-signout2$x.dist
Wk11b2_nomiss$const2<-signout2$const

#We can get a plot of the mahalanobis

# Histogram
# Requires complete cases
# Adding a vertical line that marks the cutoff for outliers

h<-hist(Wk11b2_nomiss$mahdist,breaks=25,col="lightblue", xlab="Mahalanobi's Distance",
main="Histogram of Mahalanobi's distance")
abline(v=Wk11b2_nomiss$const2,col="red")

library(plyr)

#Shows us exactly how many cases got flagged as outliers

plyr::count(Wk11b2_nomiss$outflag)
##   x freq
## 1 0 15
## 2 1 131
# Produces a subset of outliers and lists their data so we can assess *why*
# Note, it might be easier to identify these as outliers if you put variable in z-score
Wk11b2_nomiss$ID<-rownames(Wk11b2_nomiss)
newdata <- Wk11b2_nomiss[ which(Wk11b2_nomiss$outflag==0), ]
print(newdata)
## # A tibble: 15 x 11
## gender depress avlt1 ds_cor vo_cor menthlth anxiety outflag mahdist const2
## <dbl+l> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 [Fem~ 26 36 38 7 25 34.3 0 4.61 2.80
## 2 2 [Fem~ 11 60 43 4 57 101 0 3.18 2.80
## 3 2 [Fem~ 17 40 23 9 37 1 0 2.80 2.80
## 4 2 [Fem~ 8.42 38 22 9 65 101 0 3.40 2.80
## 5 2 [Fem~ 19 37 30 9 41 1 0 3.35 2.80
## 6 2 [Fem~ 0 52 23 3 53 101 0 3.18 2.80
## 7 2 [Fem~ 0 40 29 5 49 101 0 2.83 2.80
## 8 2 [Fem~ 20 51 27 10 45 34.3 0 3.64 2.80
## 9 2 [Fem~ 23 37 33 4 17 1 0 3.86 2.80
## 10 2 [Fem~ 20 45 31 8 25 1 0 3.39 2.80
## 11 1 [Mal~ 14.7 57 52 12 45 101 0 3.81 2.80
## 12 2 [Fem~ 9 58 74 7 37 1 0 3.28 2.80
## 13 2 [Fem~ 30 53 43 8 61 1 0 5.91 2.80
## 14 2 [Fem~ 22 48 32 3 17 67.7 0 3.58 2.80
## 15 2 [Fem~ 23 52 36 7 41 34.3 0 4.00 2.80
## # ... with 1 more variable: ID <chr>