library(haven)
## Warning: package 'haven' was built under R version 3.4.1
whas500 <- read_sas("~/BiostatCourses/PublicHealthComputing/Lectures/Week11Survival/SAS/whas500.sas7bdat")
whas500$GENDER <- factor(whas500$GENDER,levels=c(0,1),labels=c("male","female"))
head(whas500)
## # A tibble: 6 x 19
## ID AGE GENDER HR SYSBP DIASBP BMI CVD AFB SHO CHF
## <dbl> <dbl> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 83 male 89 152 78 25.54051 1 1 0 0
## 2 2 49 male 84 120 60 24.02398 1 0 0 0
## 3 3 70 female 83 147 88 22.14290 0 0 0 0
## 4 4 70 male 65 123 76 26.63187 1 0 0 1
## 5 5 70 male 63 135 85 24.41255 1 0 0 0
## 6 6 70 male 76 83 54 23.24236 1 0 0 0
## # ... with 8 more variables: AV3 <dbl>, MIORD <dbl>, MITYPE <dbl>,
## # YEAR <dbl>, LOS <dbl>, DSTAT <dbl>, LENFOL <dbl>, FSTAT <dbl>
In R, survival analysis functions are implemented in the survival package. The Surv function is used to define the censored times and can be enterd in two ways
library(survival)
## Warning: package 'survival' was built under R version 3.4.2
whas500.surv <- with(whas500,Surv(LENFOL,FSTAT))
str(whas500.surv)
## Surv [1:500, 1:2] 2178+ 2172+ 2190+ 297 2131+ 1 2122+ 1496 920 2175+ ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:2] "time" "status"
## - attr(*, "type")= chr "right"
whas500.fit1 <- survfit(whas500.surv~1)
str(whas500.fit1)
## List of 13
## $ n : int 500
## $ time : num [1:395] 1 2 3 4 5 6 7 10 11 14 ...
## $ n.risk : num [1:395] 500 492 484 481 479 477 472 466 463 459 ...
## $ n.event : num [1:395] 8 8 3 2 2 5 6 3 4 2 ...
## $ n.censor : num [1:395] 0 0 0 0 0 0 0 0 0 0 ...
## $ surv : num [1:395] 0.984 0.968 0.962 0.958 0.954 0.944 0.932 0.926 0.918 0.914 ...
## $ type : chr "right"
## $ std.err : num [1:395] 0.0057 0.00813 0.00889 0.00936 0.00982 ...
## $ upper : num [1:395] 0.995 0.984 0.979 0.976 0.973 ...
## $ lower : num [1:395] 0.973 0.953 0.945 0.941 0.936 ...
## $ conf.type: chr "log"
## $ conf.int : num 0.95
## $ call : language survfit(formula = whas500.surv ~ 1)
## - attr(*, "class")= chr "survfit"
plot(whas500.fit1, main="Kaplan-Meier Survival Function Estimate",
xlab="Length of Follow Up", ylab="Survival Probability", mark.time=T)
library(bshazard) #data driven nonparametric smoothing estimate of the hazard function
## Warning: package 'bshazard' was built under R version 3.4.2
## Loading required package: splines
## Loading required package: Epi
## Warning: package 'Epi' was built under R version 3.4.2
##
## Attaching package: 'Epi'
## The following object is masked from 'package:base':
##
## merge.data.frame
plot(bshazard(Surv(rep(0,length(whas500$LENFOL)),whas500$LENFOL,whas500$FSTAT)~1))
## Iterations: relative error in phi-hat = 1e-04
## phi= 1.656804 sv2= 0.1807633 df= 9.372665 lambda= 9.165601
## phi= 1.651307 sv2= 0.1880969 df= 9.541913 lambda= 8.779023
## phi= 1.648771 sv2= 0.1918277 df= 9.626732 lambda= 8.595064
## phi= 1.647567 sv2= 0.193688 df= 9.668688 lambda= 8.506295
## phi= 1.646987 sv2= 0.1946066 df= 9.689322 lambda= 8.463159
## phi= 1.646705 sv2= 0.1950581 df= 9.699443 lambda= 8.442124
## phi= 1.646567 sv2= 0.1952795 df= 9.7044 lambda= 8.43185
## phi= 1.6465 sv2= 0.1953879 df= 9.706828 lambda= 8.426827
## phi= 1.646467 sv2= 0.195441 df= 9.708015 lambda= 8.42437
## phi= 1.646451 sv2= 0.195467 df= 9.708597 lambda= 8.423168
## phi= 1.646443 sv2= 0.1954797 df= 9.708881 lambda= 8.42258
whas500.fit2 <- survfit(Surv(LENFOL,FSTAT)~GENDER,data=whas500)
plot(whas500.fit2, col=c("red","blue"), lty=c(1,2), conf.int=TRUE, mark.time=T)
legend("topright", legend=c("male","female"),col=c("red","blue"),lty=c(1,2))
survdiff(Surv(LENFOL,FSTAT)~GENDER,data=whas500) #log-rank test
## Call:
## survdiff(formula = Surv(LENFOL, FSTAT) ~ GENDER, data = whas500)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## GENDER=male 300 111 130.7 2.98 7.79
## GENDER=female 200 104 84.3 4.62 7.79
##
## Chisq= 7.8 on 1 degrees of freedom, p= 0.00525
In the survival package, we cna use the function coxph to fit this model
whas500.fit3 <- coxph(Surv(LENFOL,FSTAT)~GENDER*AGE + BMI + I(BMI^2) + HR,data=whas500)
summary(whas500.fit3)
## Call:
## coxph(formula = Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) +
## HR, data = whas500)
##
## n= 500, number of events= 215
##
## coef exp(coef) se(coef) z Pr(>|z|)
## GENDERfemale 2.113235 8.274964 0.993604 2.127 0.03343 *
## AGE 0.070961 1.073539 0.008354 8.494 < 2e-16 ***
## BMI -0.233897 0.791443 0.087949 -2.659 0.00783 **
## I(BMI^2) 0.003644 1.003650 0.001644 2.216 0.02668 *
## HR 0.012782 1.012864 0.002758 4.635 3.58e-06 ***
## GENDERfemale:AGE -0.029304 0.971121 0.012517 -2.341 0.01923 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## GENDERfemale 8.2750 0.1208 1.1803 58.0128
## AGE 1.0735 0.9315 1.0561 1.0913
## BMI 0.7914 1.2635 0.6661 0.9403
## I(BMI^2) 1.0037 0.9964 1.0004 1.0069
## HR 1.0129 0.9873 1.0074 1.0184
## GENDERfemale:AGE 0.9711 1.0297 0.9476 0.9952
##
## Concordance= 0.759 (se = 0.021 )
## Rsquare= 0.302 (max possible= 0.993 )
## Likelihood ratio test= 179.5 on 6 df, p=0
## Wald test = 155.4 on 6 df, p=0
## Score (logrank) test = 175.3 on 6 df, p=0
str(whas500.fit3)
## List of 20
## $ coefficients : Named num [1:6] 2.11323 0.07096 -0.2339 0.00364 0.01278 ...
## ..- attr(*, "names")= chr [1:6] "GENDERfemale" "AGE" "BMI" "I(BMI^2)" ...
## $ var : num [1:6, 1:6] 9.87e-01 4.90e-03 1.11e-02 -2.17e-04 -3.39e-05 ...
## $ loglik : num [1:2] -1227 -1138
## $ score : num 175
## $ iter : int 5
## $ linear.predictors: num [1:500] 0.93 -1.466 0.197 -0.347 -0.266 ...
## $ residuals : Named num [1:500] -1.758 -0.16 -0.845 0.859 -0.463 ...
## ..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
## $ means : Named num [1:6] 0.4 69.8 26.6 737.5 87 ...
## ..- attr(*, "names")= chr [1:6] "GENDERfemale" "AGE" "BMI" "I(BMI^2)" ...
## $ concordance : Named num [1:5] 57008 18141 0 119 3162
## ..- attr(*, "names")= chr [1:5] "concordant" "discordant" "tied.risk" "tied.time" ...
## $ method : chr "efron"
## $ n : int 500
## $ nevent : num 215
## $ terms :Classes 'terms', 'formula' language Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) + HR
## .. ..- attr(*, "variables")= language list(Surv(LENFOL, FSTAT), GENDER, AGE, BMI, I(BMI^2), HR)
## .. ..- attr(*, "factors")= int [1:6, 1:6] 0 1 0 0 0 0 0 0 1 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:6] "Surv(LENFOL, FSTAT)" "GENDER" "AGE" "BMI" ...
## .. .. .. ..$ : chr [1:6] "GENDER" "AGE" "BMI" "I(BMI^2)" ...
## .. ..- attr(*, "term.labels")= chr [1:6] "GENDER" "AGE" "BMI" "I(BMI^2)" ...
## .. ..- attr(*, "specials")=Dotted pair list of 3
## .. .. ..$ strata : NULL
## .. .. ..$ cluster: NULL
## .. .. ..$ tt : NULL
## .. ..- attr(*, "order")= int [1:6] 1 1 1 1 1 2
## .. ..- attr(*, "intercept")= num 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(Surv(LENFOL, FSTAT), GENDER, AGE, BMI, I(BMI^2), HR)
## .. ..- attr(*, "dataClasses")= Named chr [1:6] "nmatrix.2" "factor" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:6] "Surv(LENFOL, FSTAT)" "GENDER" "AGE" "BMI" ...
## $ assign :List of 6
## ..$ GENDER : int 1
## ..$ AGE : int 2
## ..$ BMI : int 3
## ..$ I(BMI^2) : int 4
## ..$ HR : int 5
## ..$ GENDER:AGE: int 6
## $ wald.test : num 155
## $ y : Surv [1:500, 1:2] 2178+ 2172+ 2190+ 297 2131+ 1 2122+ 1496 920 2175+ ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:500] "1" "2" "3" "4" ...
## .. ..$ : chr [1:2] "time" "status"
## ..- attr(*, "type")= chr "right"
## $ formula :Class 'formula' language Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) + HR
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## $ xlevels :List of 1
## ..$ GENDER: chr [1:2] "male" "female"
## $ contrasts :List of 1
## ..$ GENDER: chr "contr.treatment"
## $ call : language coxph(formula = Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) + HR, data = whas500)
## - attr(*, "class")= chr "coxph"
The function cox.zph provides a test of the proportional hazard assumption for each covariate in the model providing both p-values and diagnostic plots using the Schoenfeld residuals. these plots, if the proportional hazard assumption holds, then the scatterplot smoother should provide a line of roughly y=0.
whas500.diag <- cox.zph(whas500.fit3)
print(whas500.diag)
## rho chisq p
## GENDERfemale 0.0696 1.067 0.302
## AGE 0.0877 1.569 0.210
## BMI 0.0447 0.467 0.494
## I(BMI^2) -0.0331 0.264 0.607
## HR 0.0722 1.045 0.307
## GENDERfemale:AGE -0.0682 1.033 0.309
## GLOBAL NA 3.754 0.710
plot(whas500.diag)
We can also extract the residuals directly and look at the residuals vs the covariate to asses the functional form of the model. The default residuals are the martingale residuals from the cox ph model, but you can also get the deviance residuals, Schoenfeld residuals (type=“scaledsch”), and the score residuals (type=“dfbeta” or for scaled score residuals type=“dfbetas”).
whas500.resid <- residuals(whas500.fit3) #martingale residuals
par(mfrow=c(2,2))
scatter.smooth(whas500$BMI,whas500.resid, span=0.2, main="Smooth=0.2")
scatter.smooth(whas500$BMI,whas500.resid, span=0.4, main="Smooth=0.4")
scatter.smooth(whas500$BMI,whas500.resid, span=0.6, main="Smooth=0.6")
scatter.smooth(whas500$BMI,whas500.resid, span=0.8, main="Smooth=0.8")
Here are the schoenfeld residuals plot for assessing the proportional hazard assumption for age. In this case, the residuals function returns a matrix with one row for each observed event (an observation that was not censored) and a column for each variable in the model (see the output from summary of model for the order of the columns). The second column refers to the Schoenfeld residuals for AGE in this model.
whas500.resid <- residuals(whas500.fit3,type="scaledsch") #Schoenfeld residuals
par(mfrow=c(2,2))
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.2,main="Smooth=0.2")
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.4,main="Smooth=0.4")
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.6,main="Smooth=0.6")
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.8,main="Smooth=0.8")