Survival Analysis in R

library(haven)

## Warning: package 'haven' was built under R version 3.4.1

whas500 <- read_sas("~/BiostatCourses/PublicHealthComputing/Lectures/Week11Survival/SAS/whas500.sas7bdat")
whas500$GENDER <- factor(whas500$GENDER,levels=c(0,1),labels=c("male","female"))
head(whas500)

## # A tibble: 6 x 19
##      ID   AGE GENDER    HR SYSBP DIASBP      BMI   CVD   AFB   SHO   CHF
##   <dbl> <dbl> <fctr> <dbl> <dbl>  <dbl>    <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1    83   male    89   152     78 25.54051     1     1     0     0
## 2     2    49   male    84   120     60 24.02398     1     0     0     0
## 3     3    70 female    83   147     88 22.14290     0     0     0     0
## 4     4    70   male    65   123     76 26.63187     1     0     0     1
## 5     5    70   male    63   135     85 24.41255     1     0     0     0
## 6     6    70   male    76    83     54 23.24236     1     0     0     0
## # ... with 8 more variables: AV3 <dbl>, MIORD <dbl>, MITYPE <dbl>,
## #   YEAR <dbl>, LOS <dbl>, DSTAT <dbl>, LENFOL <dbl>, FSTAT <dbl>

In R, survival analysis functions are implemented in the survival package. The Surv function is used to define the censored times and can be enterd in two ways

Surv(time,event): time is the length of follow up and event is the even indicator where 0 = censored and 1 = the event occurred
Surve(start, end, event): start is the start time, end is th end time (so total follow up time is end-start), event is the same censoring variable

library(survival)

## Warning: package 'survival' was built under R version 3.4.2

whas500.surv <- with(whas500,Surv(LENFOL,FSTAT))
str(whas500.surv)

##  Surv [1:500, 1:2] 2178+ 2172+ 2190+  297  2131+    1  2122+ 1496   920  2175+ ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:2] "time" "status"
##  - attr(*, "type")= chr "right"

whas500.fit1 <- survfit(whas500.surv~1)
str(whas500.fit1)

## List of 13
##  $ n        : int 500
##  $ time     : num [1:395] 1 2 3 4 5 6 7 10 11 14 ...
##  $ n.risk   : num [1:395] 500 492 484 481 479 477 472 466 463 459 ...
##  $ n.event  : num [1:395] 8 8 3 2 2 5 6 3 4 2 ...
##  $ n.censor : num [1:395] 0 0 0 0 0 0 0 0 0 0 ...
##  $ surv     : num [1:395] 0.984 0.968 0.962 0.958 0.954 0.944 0.932 0.926 0.918 0.914 ...
##  $ type     : chr "right"
##  $ std.err  : num [1:395] 0.0057 0.00813 0.00889 0.00936 0.00982 ...
##  $ upper    : num [1:395] 0.995 0.984 0.979 0.976 0.973 ...
##  $ lower    : num [1:395] 0.973 0.953 0.945 0.941 0.936 ...
##  $ conf.type: chr "log"
##  $ conf.int : num 0.95
##  $ call     : language survfit(formula = whas500.surv ~ 1)
##  - attr(*, "class")= chr "survfit"

plot(whas500.fit1, main="Kaplan-Meier Survival Function Estimate",
     xlab="Length of Follow Up", ylab="Survival Probability", mark.time=T)

library(bshazard) #data driven nonparametric smoothing estimate of the hazard function

## Warning: package 'bshazard' was built under R version 3.4.2

## Loading required package: splines

## Loading required package: Epi

## Warning: package 'Epi' was built under R version 3.4.2

## 
## Attaching package: 'Epi'

## The following object is masked from 'package:base':
## 
##     merge.data.frame

plot(bshazard(Surv(rep(0,length(whas500$LENFOL)),whas500$LENFOL,whas500$FSTAT)~1))

## Iterations: relative error in phi-hat = 1e-04 
## phi= 1.656804   sv2= 0.1807633   df= 9.372665   lambda= 9.165601 
## phi= 1.651307   sv2= 0.1880969   df= 9.541913   lambda= 8.779023 
## phi= 1.648771   sv2= 0.1918277   df= 9.626732   lambda= 8.595064 
## phi= 1.647567   sv2= 0.193688   df= 9.668688   lambda= 8.506295 
## phi= 1.646987   sv2= 0.1946066   df= 9.689322   lambda= 8.463159 
## phi= 1.646705   sv2= 0.1950581   df= 9.699443   lambda= 8.442124 
## phi= 1.646567   sv2= 0.1952795   df= 9.7044   lambda= 8.43185 
## phi= 1.6465   sv2= 0.1953879   df= 9.706828   lambda= 8.426827 
## phi= 1.646467   sv2= 0.195441   df= 9.708015   lambda= 8.42437 
## phi= 1.646451   sv2= 0.195467   df= 9.708597   lambda= 8.423168 
## phi= 1.646443   sv2= 0.1954797   df= 9.708881   lambda= 8.42258

Comparing Survival Functions

whas500.fit2 <- survfit(Surv(LENFOL,FSTAT)~GENDER,data=whas500)
plot(whas500.fit2, col=c("red","blue"), lty=c(1,2), conf.int=TRUE, mark.time=T)
legend("topright", legend=c("male","female"),col=c("red","blue"),lty=c(1,2))

survdiff(Surv(LENFOL,FSTAT)~GENDER,data=whas500) #log-rank test

## Call:
## survdiff(formula = Surv(LENFOL, FSTAT) ~ GENDER, data = whas500)
## 
##                 N Observed Expected (O-E)^2/E (O-E)^2/V
## GENDER=male   300      111    130.7      2.98      7.79
## GENDER=female 200      104     84.3      4.62      7.79
## 
##  Chisq= 7.8  on 1 degrees of freedom, p= 0.00525

Cox’s Proportional Hazard Regression Model

In the survival package, we cna use the function coxph to fit this model

whas500.fit3 <- coxph(Surv(LENFOL,FSTAT)~GENDER*AGE + BMI + I(BMI^2) + HR,data=whas500)
summary(whas500.fit3)

## Call:
## coxph(formula = Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) + 
##     HR, data = whas500)
## 
##   n= 500, number of events= 215 
## 
##                       coef exp(coef)  se(coef)      z Pr(>|z|)    
## GENDERfemale      2.113235  8.274964  0.993604  2.127  0.03343 *  
## AGE               0.070961  1.073539  0.008354  8.494  < 2e-16 ***
## BMI              -0.233897  0.791443  0.087949 -2.659  0.00783 ** 
## I(BMI^2)          0.003644  1.003650  0.001644  2.216  0.02668 *  
## HR                0.012782  1.012864  0.002758  4.635 3.58e-06 ***
## GENDERfemale:AGE -0.029304  0.971121  0.012517 -2.341  0.01923 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## GENDERfemale        8.2750     0.1208    1.1803   58.0128
## AGE                 1.0735     0.9315    1.0561    1.0913
## BMI                 0.7914     1.2635    0.6661    0.9403
## I(BMI^2)            1.0037     0.9964    1.0004    1.0069
## HR                  1.0129     0.9873    1.0074    1.0184
## GENDERfemale:AGE    0.9711     1.0297    0.9476    0.9952
## 
## Concordance= 0.759  (se = 0.021 )
## Rsquare= 0.302   (max possible= 0.993 )
## Likelihood ratio test= 179.5  on 6 df,   p=0
## Wald test            = 155.4  on 6 df,   p=0
## Score (logrank) test = 175.3  on 6 df,   p=0

str(whas500.fit3)

## List of 20
##  $ coefficients     : Named num [1:6] 2.11323 0.07096 -0.2339 0.00364 0.01278 ...
##   ..- attr(*, "names")= chr [1:6] "GENDERfemale" "AGE" "BMI" "I(BMI^2)" ...
##  $ var              : num [1:6, 1:6] 9.87e-01 4.90e-03 1.11e-02 -2.17e-04 -3.39e-05 ...
##  $ loglik           : num [1:2] -1227 -1138
##  $ score            : num 175
##  $ iter             : int 5
##  $ linear.predictors: num [1:500] 0.93 -1.466 0.197 -0.347 -0.266 ...
##  $ residuals        : Named num [1:500] -1.758 -0.16 -0.845 0.859 -0.463 ...
##   ..- attr(*, "names")= chr [1:500] "1" "2" "3" "4" ...
##  $ means            : Named num [1:6] 0.4 69.8 26.6 737.5 87 ...
##   ..- attr(*, "names")= chr [1:6] "GENDERfemale" "AGE" "BMI" "I(BMI^2)" ...
##  $ concordance      : Named num [1:5] 57008 18141 0 119 3162
##   ..- attr(*, "names")= chr [1:5] "concordant" "discordant" "tied.risk" "tied.time" ...
##  $ method           : chr "efron"
##  $ n                : int 500
##  $ nevent           : num 215
##  $ terms            :Classes 'terms', 'formula'  language Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) + HR
##   .. ..- attr(*, "variables")= language list(Surv(LENFOL, FSTAT), GENDER, AGE, BMI, I(BMI^2), HR)
##   .. ..- attr(*, "factors")= int [1:6, 1:6] 0 1 0 0 0 0 0 0 1 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:6] "Surv(LENFOL, FSTAT)" "GENDER" "AGE" "BMI" ...
##   .. .. .. ..$ : chr [1:6] "GENDER" "AGE" "BMI" "I(BMI^2)" ...
##   .. ..- attr(*, "term.labels")= chr [1:6] "GENDER" "AGE" "BMI" "I(BMI^2)" ...
##   .. ..- attr(*, "specials")=Dotted pair list of 3
##   .. .. ..$ strata : NULL
##   .. .. ..$ cluster: NULL
##   .. .. ..$ tt     : NULL
##   .. ..- attr(*, "order")= int [1:6] 1 1 1 1 1 2
##   .. ..- attr(*, "intercept")= num 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Surv(LENFOL, FSTAT), GENDER, AGE, BMI, I(BMI^2), HR)
##   .. ..- attr(*, "dataClasses")= Named chr [1:6] "nmatrix.2" "factor" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:6] "Surv(LENFOL, FSTAT)" "GENDER" "AGE" "BMI" ...
##  $ assign           :List of 6
##   ..$ GENDER    : int 1
##   ..$ AGE       : int 2
##   ..$ BMI       : int 3
##   ..$ I(BMI^2)  : int 4
##   ..$ HR        : int 5
##   ..$ GENDER:AGE: int 6
##  $ wald.test        : num 155
##  $ y                : Surv [1:500, 1:2] 2178+ 2172+ 2190+  297  2131+    1  2122+ 1496   920  2175+ ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:500] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:2] "time" "status"
##   ..- attr(*, "type")= chr "right"
##  $ formula          :Class 'formula'  language Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) + HR
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##  $ xlevels          :List of 1
##   ..$ GENDER: chr [1:2] "male" "female"
##  $ contrasts        :List of 1
##   ..$ GENDER: chr "contr.treatment"
##  $ call             : language coxph(formula = Surv(LENFOL, FSTAT) ~ GENDER * AGE + BMI + I(BMI^2) +      HR, data = whas500)
##  - attr(*, "class")= chr "coxph"

The function cox.zph provides a test of the proportional hazard assumption for each covariate in the model providing both p-values and diagnostic plots using the Schoenfeld residuals. these plots, if the proportional hazard assumption holds, then the scatterplot smoother should provide a line of roughly y=0.

whas500.diag <- cox.zph(whas500.fit3)
print(whas500.diag)

##                      rho chisq     p
## GENDERfemale      0.0696 1.067 0.302
## AGE               0.0877 1.569 0.210
## BMI               0.0447 0.467 0.494
## I(BMI^2)         -0.0331 0.264 0.607
## HR                0.0722 1.045 0.307
## GENDERfemale:AGE -0.0682 1.033 0.309
## GLOBAL                NA 3.754 0.710

plot(whas500.diag)

We can also extract the residuals directly and look at the residuals vs the covariate to asses the functional form of the model. The default residuals are the martingale residuals from the cox ph model, but you can also get the deviance residuals, Schoenfeld residuals (type=“scaledsch”), and the score residuals (type=“dfbeta” or for scaled score residuals type=“dfbetas”).

whas500.resid <- residuals(whas500.fit3) #martingale residuals
par(mfrow=c(2,2))
scatter.smooth(whas500$BMI,whas500.resid, span=0.2, main="Smooth=0.2")
scatter.smooth(whas500$BMI,whas500.resid, span=0.4, main="Smooth=0.4")
scatter.smooth(whas500$BMI,whas500.resid, span=0.6, main="Smooth=0.6")
scatter.smooth(whas500$BMI,whas500.resid, span=0.8, main="Smooth=0.8")

Here are the schoenfeld residuals plot for assessing the proportional hazard assumption for age. In this case, the residuals function returns a matrix with one row for each observed event (an observation that was not censored) and a column for each variable in the model (see the output from summary of model for the order of the columns). The second column refers to the Schoenfeld residuals for AGE in this model.

whas500.resid <- residuals(whas500.fit3,type="scaledsch") #Schoenfeld residuals
par(mfrow=c(2,2))
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.2,main="Smooth=0.2")
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.4,main="Smooth=0.4")
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.6,main="Smooth=0.6")
scatter.smooth(whas500$LENFOL[whas500$FSTAT==1],whas500.resid[,2], span=0.8,main="Smooth=0.8")

Survival Analysis in R

Robert Parker

November 13, 2017

Comparing Survival Functions

Cox’s Proportional Hazard Regression Model