Wide and Long Data Formats

We wil examine a longitudinal data set from a clinical trial for a seizure medication. The dataset is given in long format with the follwing column variables

First, we need to read in the dataset.

seizure.long <- read.table("H:\\BiostatCourses\\PublicHealthComputing\\Lectures\\Week5DataManagement\\R\\seizure.txt",
                           col.names = c("ID", "S.count", "visit", "Trt", "Age", "Weeks"))
head(seizure.long)
##    ID S.count visit Trt Age Weeks
## 1 104      11     0   0  31     8
## 2 104       5     1   0  31     2
## 3 104       3     2   0  31     2
## 4 104       3     3   0  31     2
## 5 104       3     4   0  31     2
## 6 106      11     0   0  30     8
length(seizure.long[,1]) #295 patients
## [1] 295

The R function reshape can be used to change between wide and long formats and is included with base R. To go from long to wide format we will need the following parameters in the reshape function

seizure.wide <- reshape(seizure.long, v.names = c("S.count", "Weeks", "Age"), timevar = "visit", idvar = "ID",direction = "wide")
head(seizure.wide)
##     ID Trt S.count.0 Weeks.0 Age.0 S.count.1 Weeks.1 Age.1 S.count.2
## 1  104   0        11       8    31         5       2    31         3
## 6  106   0        11       8    30         3       2    30         5
## 11 107   0         6       8    25         2       2    25         4
## 16 114   0         8       8    36         4       2    36         4
## 21 116   0        66       8    22         7       2    22        18
## 26 118   0        27       8    29         5       2    29         2
##    Weeks.2 Age.2 S.count.3 Weeks.3 Age.3 S.count.4 Weeks.4 Age.4
## 1        2    31         3       2    31         3       2    31
## 6        2    30         3       2    30         3       2    30
## 11       2    25         0       2    25         5       2    25
## 16       2    36         1       2    36         4       2    36
## 21       2    22         9       2    22        21       2    22
## 26       2    29         8       2    29         7       2    29

And to go back to long, we can do the following

seizure.long2 <- reshape(seizure.wide, 
                         varying = list(Weeks = c("Weeks.0", "Weeks.1", "Weeks.2", "Weeks.3", "Weeks.4"),
                                        Age = c("Age.0", "Age.1","Age.2","Age.3","Age.4"),
                                        S.count = c("S.count.0","S.count.1","S.count.2","S.count.3","S.count.4")),
                         timevar = "Visit", idvar = "ID", times = c(0,1,2,3,4))
head(seizure.long2[order(seizure.long2$Trt,seizure.long2$ID),]) #order returns the indicies of the original
##        ID Trt Visit Weeks.0 Age.0 S.count.0
## 104.0 104   0     0       8    31        11
## 104.1 104   0     1       2    31         5
## 104.2 104   0     2       2    31         3
## 104.3 104   0     3       2    31         3
## 104.4 104   0     4       2    31         3
## 106.0 106   0     0       8    30        11
                                                                #positions of the sorted elements

Merging Datasets

In this section, we will learn how to combine datasets by either stacking or performing a join.

Let’s look at some simple examples

# Two data sets on the same subjects but with different variables measures
data1 <- data.frame(ID=1:5, x=letters[1:5])
data2 <- data.frame(ID=1:5, y=letters[6:10])
data1
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 d
## 5  5 e
data2
##   ID y
## 1  1 f
## 2  2 g
## 3  3 h
## 4  4 i
## 5  5 j
cbind(data1,data2) #add columns
##   ID x ID y
## 1  1 a  1 f
## 2  2 b  2 g
## 3  3 c  3 h
## 4  4 d  4 i
## 5  5 e  5 j
cbind(data1,y = data2$y) #remove double ID column
##   ID x y
## 1  1 a f
## 2  2 b g
## 3  3 c h
## 4  4 d i
## 5  5 e j
#Stack using rbind - both datasets have the same column names and same number of columns
data1 <- data.frame(ID=1:5, x=letters[1:5])
data2 <- data.frame(ID=6:10, x=letters[6:10])
data1
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 d
## 5  5 e
data2
##   ID x
## 1  6 f
## 2  7 g
## 3  8 h
## 4  9 i
## 5 10 j
rbind(data1,data2)
##    ID x
## 1   1 a
## 2   2 b
## 3   3 c
## 4   4 d
## 5   5 e
## 6   6 f
## 7   7 g
## 8   8 h
## 9   9 i
## 10 10 j

The R functions cbind and rbind are pretty restrictive. A way to perform more general joins is the merge function. Merge will merge by all the variables that the two datasets have in common (same column name) and append all other variable as additional columns. By default, it will only take common values.

# Merge two datasets by an ID variable, where ID is the same for both datasets
data1 <- data.frame(ID=1:5, x=letters[1:5])
data2 <- data.frame(ID=1:5, y=letters[6:10])
data1
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 d
## 5  5 e
data2
##   ID y
## 1  1 f
## 2  2 g
## 3  3 h
## 4  4 i
## 5  5 j
merge(data1, data2)
##   ID x y
## 1  1 a f
## 2  2 b g
## 3  3 c h
## 4  4 d i
## 5  5 e j
# Merge two datasets by an ID variable, where ID is not the same for both datasets
data1 <- data.frame(ID=1:5, x=letters[1:5])
data2 <- data.frame(ID=4:8, y=letters[6:10])
data1
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 d
## 5  5 e
data2
##   ID y
## 1  4 f
## 2  5 g
## 3  6 h
## 4  7 i
## 5  8 j
merge(data1, data2)
##   ID x y
## 1  4 d f
## 2  5 e g
merge(data1, data2, all=TRUE)
##   ID    x    y
## 1  1    a <NA>
## 2  2    b <NA>
## 3  3    c <NA>
## 4  4    d    f
## 5  5    e    g
## 6  6 <NA>    h
## 7  7 <NA>    i
## 8  8 <NA>    j
merge(data1, data2, all.x=TRUE) # Only keep the rows from the 1st argument data1
##   ID x    y
## 1  1 a <NA>
## 2  2 b <NA>
## 3  3 c <NA>
## 4  4 d    f
## 5  5 e    g
merge(data1, data2, all.y=TRUE) # Only keep the rows from the 2nd argument data2
##   ID    x y
## 1  4    d f
## 2  5    e g
## 3  6 <NA> h
## 4  7 <NA> i
## 5  8 <NA> j
# Merge two datasets by an ID variable, where both dataset have the same names
data1 <- data.frame(ID=1:5, x=letters[1:5])
data2 <- data.frame(ID=1:5, x=letters[c(1:3,9:10)])
data1
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 d
## 5  5 e
data2
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 i
## 5  5 j
merge(data1, data2, all=TRUE)      # Add rows
##   ID x
## 1  1 a
## 2  2 b
## 3  3 c
## 4  4 d
## 5  4 i
## 6  5 e
## 7  5 j
merge(data1, data2, by="ID")       # Add columns
##   ID x.x x.y
## 1  1   a   a
## 2  2   b   b
## 3  3   c   c
## 4  4   d   i
## 5  5   e   j
merge(data1, data2, by="ID", suffixes=c(1, 2))
##   ID x1 x2
## 1  1  a  a
## 2  2  b  b
## 3  3  c  c
## 4  4  d  i
## 5  5  e  j
# Merge two datasets by an ID variable, where the ID variable has a different name
data1 <- data.frame(ID1=1:5, x=letters[1:5])
data2 <- data.frame(ID2=1:5, x=letters[6:10])
data1
##   ID1 x
## 1   1 a
## 2   2 b
## 3   3 c
## 4   4 d
## 5   5 e
data2
##   ID2 x
## 1   1 f
## 2   2 g
## 3   3 h
## 4   4 i
## 5   5 j
merge(data1,data2) #tries to match by x but no values in common
## [1] x   ID1 ID2
## <0 rows> (or 0-length row.names)
merge(data1,data2, all = T) #merging by x is not what we want
##    x ID1 ID2
## 1  a   1  NA
## 2  b   2  NA
## 3  c   3  NA
## 4  d   4  NA
## 5  e   5  NA
## 6  f  NA   1
## 7  g  NA   2
## 8  h  NA   3
## 9  i  NA   4
## 10 j  NA   5
merge(data1, data2, by.x="ID1", by.y="ID2")
##   ID1 x.x x.y
## 1   1   a   f
## 2   2   b   g
## 3   3   c   h
## 4   4   d   i
## 5   5   e   j

Data Cleaning

We will use the patient dataset again to examine checking for invalid data, missing values, and duplicates. Recall, this dataset has the following variables with the given valid values and ranges

First, we will read in the dataset.

patients <- read.fwf("H:\\BiostatCourses\\PublicHealthComputing\\Lectures\\Week5DataManagement\\SAS\\patients.txt",
                     widths = c(3,1,10,3,3,3,3,1), 
                     col.names = c("PATNO", "GENDER", "VISIT", "HR", "SBP", "DBP", "DX", "AE"),
                     stringsAsFactors = F)
patients
##    PATNO GENDER      VISIT  HR SBP DBP  DX AE
## 1    001      M 11/11/1998  88 140  80   1  0
## 2    002      F 11/13/1998  84 120  78   X  0
## 3    003      X 10/21/1998  68 190 100   3  1
## 4    004      F 01/01/1999 101 200 120   5  A
## 5    XX5      M 05/07/1998  68 120  80   1  0
## 6    006        06/15/1999  72 102  68   6  1
## 7    007      M 08/32/1998  88 148 102      0
## 8    008      F 08/08/1998 210  NA  NA   7  0
## 9    009      M 09/25/1999  86 240 180   4  1
## 10   010      F 10/19/1999      40 120   1  0
## 11   011      M 13/13/1998  68 300  20   4  1
## 12   012      M 10/12/98    60 122  74      0
## 13   013      2 08/23/1999  74 108  64   1   
## 14   014      M 02/02/1999  22 130  90      1
## 15   002      F 11/13/1998  84 120  78   X  0
## 16   003      M 11/12/1999  58 112  74      0
## 17   015      F             82 148  88   3  1
## 18   017      F 04/05/1999 208  NA  84   2  0
## 19   019      M 06/07/1999  58 118  70      0
## 20   123      M 15/12/1999  60  NA  NA   1  0
## 21   321      F            900 400 200   5  1
## 22   020      F 99/99/9999  10  20   8      0
## 23   022      M 10/10/1999  48 114  82   2  1
## 24   023      F 12/31/1998  22  34  78      0
## 25   024      F 11/09/1998 76  120  80   1  0
## 26   025      M 01/01/1999  74 102  68   5  1
## 27   027      F NOTAVAIL    NA 166 106   7  0
## 28   028      F 03/28/1998  66 150  90   3  0
## 29   029      M 05/15/1998      NA  NA   4  1
## 30   006      F 07/07/1999  82 148  84   1  0
sapply(patients,class)
##       PATNO      GENDER       VISIT          HR         SBP         DBP 
## "character" "character" "character" "character"   "integer"   "integer" 
##          DX          AE 
## "character" "character"

Note that visit should be a date and heart rate should be numeric, so we need to reformat the dataset before going any further.

#install.packages("lubridate")
library(lubridate) #package that handles dates
## Warning: package 'lubridate' was built under R version 3.4.1
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
patients$VISIT <- mdy(patients$VISIT) #convert to Date class from character date given as Month Day Year
## Warning: 7 failed to parse.
#HR was coerced to a factor becuase of the NA entry for patient
patients$HR <- as.numeric(patients$HR)
## Warning: NAs introduced by coercion

Now let’s check for invalid gender entries. They should be either “M” or “F”.

patients[!(patients$GENDER %in% c("M","F", " ")),c(1,2)]
##    PATNO GENDER
## 3    003      X
## 13   013      2

Let’s check the diganosis codes to see if they contain anything other than numbers.

#install.packages("stringr")
library(stringr) #contains string manipulation functions
## Warning: package 'stringr' was built under R version 3.4.1
split.dx <- str_split(patients$DX, "") #split each DX code into the individual components
                                                       #e.g. "123" splits to "1" "2" "3"
head(split.dx)
## [[1]]
## [1] " " " " "1"
## 
## [[2]]
## [1] " " " " "X"
## 
## [[3]]
## [1] " " " " "3"
## 
## [[4]]
## [1] " " " " "5"
## 
## [[5]]
## [1] " " " " "1"
## 
## [[6]]
## [1] " " " " "6"
split.match <- lapply(split.dx, FUN = match, table = c(" ","1","2","3","4","5","6","7","8","9", character(0)))
head(split.match)
## [[1]]
## [1] 1 1 2
## 
## [[2]]
## [1]  1  1 NA
## 
## [[3]]
## [1] 1 1 4
## 
## [[4]]
## [1] 1 1 6
## 
## [[5]]
## [1] 1 1 2
## 
## [[6]]
## [1] 1 1 7
DX.invalid <- sapply(lapply(split.match, is.na), any)
head(DX.invalid)
## [1] FALSE  TRUE FALSE FALSE FALSE FALSE
patients[DX.invalid, c("PATNO","DX")]
##    PATNO  DX
## 2    002   X
## 15   002   X

Next, let’s check for out of bounds blood pressure measurements. Be sure to be careful about comparing NA elements in R.

patients$SBP < 80 | patients$SBP > 200 #some SBP are NA
##  [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE    NA  TRUE  TRUE  TRUE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE    NA FALSE    NA  TRUE  TRUE
## [23] FALSE  TRUE FALSE FALSE FALSE FALSE    NA FALSE
patients[ifelse(is.na(patients$SBP < 80 | patients$SBP > 200) | patients$SBP < 80 | patients$SBP > 200,T,F), 
         c("PATNO", "SBP")]
##    PATNO SBP
## 8    008  NA
## 9    009 240
## 10   010  40
## 11   011 300
## 18   017  NA
## 20   123  NA
## 21   321 400
## 22   020  20
## 24   023  34
## 29   029  NA

We can also check for or remove the rows that have NA using

na.omit(patients)
##    PATNO GENDER      VISIT  HR SBP DBP  DX AE
## 1    001      M 1998-11-11  88 140  80   1  0
## 2    002      F 1998-11-13  84 120  78   X  0
## 3    003      X 1998-10-21  68 190 100   3  1
## 4    004      F 1999-01-01 101 200 120   5  A
## 5    XX5      M 1998-05-07  68 120  80   1  0
## 6    006        1999-06-15  72 102  68   6  1
## 9    009      M 1999-09-25  86 240 180   4  1
## 12   012      M 1998-10-12  60 122  74      0
## 13   013      2 1999-08-23  74 108  64   1   
## 14   014      M 1999-02-02  22 130  90      1
## 15   002      F 1998-11-13  84 120  78   X  0
## 16   003      M 1999-11-12  58 112  74      0
## 19   019      M 1999-06-07  58 118  70      0
## 23   022      M 1999-10-10  48 114  82   2  1
## 24   023      F 1998-12-31  22  34  78      0
## 25   024      F 1998-11-09  76 120  80   1  0
## 26   025      M 1999-01-01  74 102  68   5  1
## 28   028      F 1998-03-28  66 150  90   3  0
## 30   006      F 1999-07-07  82 148  84   1  0
patients[!complete.cases(patients),]
##    PATNO GENDER      VISIT  HR SBP DBP  DX AE
## 7    007      M       <NA>  88 148 102      0
## 8    008      F 1998-08-08 210  NA  NA   7  0
## 10   010      F 1999-10-19  NA  40 120   1  0
## 11   011      M       <NA>  68 300  20   4  1
## 17   015      F       <NA>  82 148  88   3  1
## 18   017      F 1999-04-05 208  NA  84   2  0
## 20   123      M       <NA>  60  NA  NA   1  0
## 21   321      F       <NA> 900 400 200   5  1
## 22   020      F       <NA>  10  20   8      0
## 27   027      F       <NA>  NA 166 106   7  0
## 29   029      M 1998-05-15  NA  NA  NA   4  1
#Duplicates
#?unique
#?duplicated

data1 <- data.frame(ID=1:5, x=letters[1:5])
data2 <- data.frame(ID=1:5, x=letters[c(1:3,9:10)])
data.dup <- rbind(data1,data2)
data.dup
##    ID x
## 1   1 a
## 2   2 b
## 3   3 c
## 4   4 d
## 5   5 e
## 6   1 a
## 7   2 b
## 8   3 c
## 9   4 i
## 10  5 j
unique(data.dup) #removes duplicate rows. Can use MARGIN = 2 to remove duplicate columns
##    ID x
## 1   1 a
## 2   2 b
## 3   3 c
## 4   4 d
## 5   5 e
## 9   4 i
## 10  5 j
duplicated(data.dup)
##  [1] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE
data.dup[duplicated(data.dup),] #These rows are duplicates
##   ID x
## 6  1 a
## 7  2 b
## 8  3 c