In this lecture, we will discuss simulations in R. We will introduce the functions in R for simulating from common probability distributions, along with their pdf, cdf, and quantile functions. Then, we will look at a simulation experiment involving the chi-square hypothesis test for a single variance parameter.

Distributions

R has many built-in probability distributions, and they all work in a similar manner. You should become familiar with the binomial, chi-square, F, normal, Poisson, Student's t, and uniform distributions. The functions for all of these distributions in R have the following form, where xxx is the distribution's abbreviated name (e.g., dbinom, pbinom, qbinom, rbinom for the binomial):

dxxx - the density (pdf) or probability mass function
pxxx - the cumulative distribution function (cdf)
qxxx - the quantile function
rxxx - random number generation

The pxxx and qxxx functions have a logical lower.tail argument to choose between lower- and upper-tail probabilities. These distributions are useful for simulating data as well as for calculating critical values for confidence intervals and p-values for hypothesis tests.
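For example, here is a quick illustrative check of lower.tail with the standard normal (this snippet is not part of the original examples):

pnorm(1.96)                      # P(Z <= 1.96), about 0.975
pnorm(1.96, lower.tail=FALSE)    # P(Z > 1.96), about 0.025
qnorm(0.05)                      # 5th percentile, about -1.645
qnorm(0.05, lower.tail=FALSE)    # value with 0.05 above it, i.e. the 95th percentile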

Example 1. Binomial - requires size (n) and prob (p)

# Part 1: Create a table and plot of the distribution 
#         function of a binomial with n=10 and p=0.25

# Define the possible x-values
x <- 0:10
# Create the probabilities of interest
y <- dbinom(x, 10, 0.25)
# Create a plot
barplot(y, names.arg=x, xlab="Possible Values of X", 
        ylab="P(X=x)", ylim=c(0,0.3), 
        main="PDF of Binomial (N=10 p=0.25)")
box()

# Create nice table output
tab <- round(data.frame(x,y), digits=4)
tab
##     x      y
## 1   0 0.0563
## 2   1 0.1877
## 3   2 0.2816
## 4   3 0.2503
## 5   4 0.1460
## 6   5 0.0584
## 7   6 0.0162
## 8   7 0.0031
## 9   8 0.0004
## 10  9 0.0000
## 11 10 0.0000
attributes(tab)$names[2] <- "P(X=x)"
# methods("print")
# help(print.data.frame)
print(tab, row.names=FALSE)
##   x P(X=x)
##   0 0.0563
##   1 0.1877
##   2 0.2816
##   3 0.2503
##   4 0.1460
##   5 0.0584
##   6 0.0162
##   7 0.0031
##   8 0.0004
##   9 0.0000
##  10 0.0000
# Part 2:  Create a table and plot of the CDF
y.cdf <- pbinom(x, 10, 0.25)
# Create a plot
barplot(y.cdf, names.arg=x, xlab="Possible Values of X", 
        ylab="Cumulative Probability", ylim=c(0,1), 
        main="CDF of Binomial (N=10 p=0.25)")
box()

# Create nice table output
tab <- round(data.frame(x,y, y.cdf), digits=4)
attributes(tab)$names[2] <- "P(X=x)"
attributes(tab)$names[3] <- "P(X <= x)"
print(tab, row.names=FALSE)
##   x P(X=x) P(X <= x)
##   0 0.0563    0.0563
##   1 0.1877    0.2440
##   2 0.2816    0.5256
##   3 0.2503    0.7759
##   4 0.1460    0.9219
##   5 0.0584    0.9803
##   6 0.0162    0.9965
##   7 0.0031    0.9996
##   8 0.0004    1.0000
##   9 0.0000    1.0000
##  10 0.0000    1.0000
# Part 3: Find the 25th, 50th, and 75th percentiles
qbinom(c(0.25, 0.50, 0.75), 10, 0.25)
## [1] 2 2 3
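For a discrete distribution, qbinom() returns the smallest value of x whose cumulative probability is at least the requested level, which you can verify against the CDF table above:

# Why the 25th percentile is 2:
# P(X <= 1) is about 0.2440, which is below 0.25,
# while P(X <= 2) is about 0.5256, which is at or above 0.25
pbinom(1, 10, 0.25)
pbinom(2, 10, 0.25)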
# Part 4: Generate a random sample of 100 binomial random variables
samp.x <- rbinom(100, 10, 0.25)
par(mfrow=c(1,2))
hist(samp.x, breaks=seq(-0.5,10.5, 1), xlab="Possible Values of X", 
        main="Random Sample of 100 Binomial RVs (N=10 p=0.25)",
        xaxt="n")
axis(1, at=0:10, labels=0:10)
box()
barplot(y, names.arg=x, xlab="Possible Values of X", 
        ylab="P(X=x)", ylim=c(0,0.3), 
        main="PDF of Binomial (N=10 p=0.25)")
box()

par(mfrow=c(1,1))
# Maximize window for a good view of this graph

Example 2. Normal distribution - requires mean and sd

# Part 1: Create plots of the PDF and CDF of a normal distribution 
#         (mean=100, sd=16)

# Create a good coverage of the needed x-values (+/- 3 std devs)
100-16*3
## [1] 52
100+16*3
## [1] 148
x <- seq(50, 150, 0.5)
y.pdf <- dnorm(x, 100, 16)
y.cdf <- pnorm(x, 100, 16)
plot(x, y.pdf)

par(mfrow=c(1,2))
# In this case, plot will create a scatterplot by default
# we want to connect the points with lines and not see the points
plot(x, y.pdf, type="l")
plot(x, y.cdf, type="l")

par(mfrow=c(1,1))
# Part 2: Find quantiles/percentiles and p-values

# Suppose we want a 95% confidence interval
# We need Z_0.975 from a standard normal
qnorm(0.975, 0, 1)
## [1] 1.959964
# We can find specific percentiles for a given normal distribution
qnorm(c(0.25, 0.75), 100, 16)
## [1]  89.20816 110.79184
# Suppose the Z-value for a two-sided hypothesis test is 2.75
# we can calculate the p-value
half.pval <- pnorm(2.75, 0, 1, lower.tail=FALSE)
pval <- 2*half.pval
pval
## [1] 0.005959526
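Using the symmetry of the standard normal, the same two-sided p-value can be computed in one call:

2*pnorm(-abs(2.75))   # same result as doubling the upper-tail probability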
# Part 3: Random sample from a normal distribution
samp.x <- rnorm(200, 100, 16)
par(mfrow=c(1,2))
hist(samp.x)
box()
plot(x, y.pdf, type="l")

par(mfrow=c(1,1))
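A useful follow-up is to draw the theoretical density on top of the histogram so the sample can be compared to the model directly. Here is a small sketch reusing the samp.x, x, and y.pdf objects created above; freq=FALSE puts the histogram on the density scale.

hist(samp.x, freq=FALSE, xlab="x", main="Sample vs. Theoretical Density")
lines(x, y.pdf, lwd=2)   # overlay the N(100, 16^2) density curve
box()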

Simulation Experiment

For this simulation experiment, we will explore the properties of the hypothesis test and confidence interval for a population variance. If we sample from a normal population with variance \(\sigma^2\), then we test \[H_0:\sigma^2=\sigma^2_0\;vs\;H_1:\sigma^2\neq\sigma^2_0\] by calculating the test statistic \[T=\dfrac{(n-1)s^2}{\sigma^2_0}\overset{H_0}{\sim}\chi^2(df=n-1).\] We reject the null hypothesis for this two-sided test if \(T>\chi^2_{1-\alpha/2,n-1}\text{ or }\;T<\chi^2_{\alpha/2,n-1}\). The confidence interval for \(\sigma^2\) is given by \[\left(\dfrac{(n-1)s^2}{\chi^2_{1-\alpha/2,n-1}},\dfrac{(n-1)s^2}{\chi^2_{\alpha/2,n-1}}\right)\]
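Before simulating, it may help to see these formulas applied once. The following is a minimal sketch with a single simulated sample; the seed, sample size, and null value are arbitrary illustrative choices, not part of the simulation code below.

set.seed(1)                               # arbitrary seed for reproducibility
n <- 25
samp <- rnorm(n, mean=0, sd=1)            # sample from a N(0,1) population
s.var <- var(samp)
null.var <- 1                             # hypothesized value sigma_0^2
T.stat <- (n-1)*s.var/null.var            # chi-square test statistic
crit <- qchisq(c(0.025, 0.975), df=n-1)   # lower and upper critical values
p.val <- 2*min(pchisq(T.stat, n-1), pchisq(T.stat, n-1, lower.tail=FALSE))
ci <- (n-1)*s.var/rev(crit)               # (lower, upper) 95% CI for sigma^2
c(T.stat=T.stat, p.value=p.val)
ci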

Let’s start with the confidence interval.

####################################
# Coverage Probability Simulations #
####################################

# Testing Basic Code
n <- 10000   # sample size
v <- 1       # true variance
x <- 0       # counter for intervals that cover v
N <- 50      # number of simulated samples
for(i in 1:N){
  sample <- rnorm(n, 0, sqrt(v))   # v is the variance, so the sd is sqrt(v)
  s.var <- var(sample)
  chi.low <- qchisq(0.025, n-1)
  chi.high <- qchisq(0.975, n-1)
  ci.low <- (n-1)*s.var/chi.high
  ci.high <- (n-1)*s.var/chi.low
  #cat(ci.low, ci.high, "\n") # uncomment to see each confidence interval calculated
  if(ci.low < v & ci.high > v) x <- x+1
}
cat("The coverage probability in this simulation is:",
    x/N, "\n")
## The coverage probability in this simulation is: 0.98
# first simulation - normal population
# Since the population is normal, we know the exact coverage probability is 95%
sim1 <- function(n=100, v=1, N=10000){
  x <- 0
  for(i in 1:N){
    sample <- rnorm(n, 0, sqrt(v))   # v is the variance, so the sd is sqrt(v)
    s.var <- var(sample)
    chi.low <- qchisq(0.025, n-1)
    chi.high <- qchisq(0.975, n-1)
    ci.low <- (n-1)*s.var/chi.high
    ci.high <- (n-1)*s.var/chi.low
    #cat(ci.low, ci.high, "\n") # uncomment to see each confidence interval calculated
    if(ci.low < v & ci.high > v) x <- x+1
  }
  cat("The coverage probability in this simulation is:",
      x/N, "\n")
}
# Run the default simulation
sim1()
## The coverage probability in this simulation is: 0.9507
# Run some alternative simulations
sim1(n=20, v=1, N=10000)
## The coverage probability in this simulation is: 0.9509
sim1(n=5, v=1, N=10000)
## The coverage probability in this simulation is: 0.9463
# second simulation - non-normal population
# With the population no longer normal, we don't know the exact coverage probability
sim2 <- function(n=20, N=10000, d=3){
  x <- 0
  v <- 2*d   # the variance of a chi-square(d) population is 2d
  for(i in 1:N){
    sample <- rchisq(n, d)
    s.var <- var(sample)
    chi.low <- qchisq(0.025, n-1)
    chi.high <- qchisq(0.975, n-1)
    ci.low <- (n-1)*s.var/chi.high
    ci.high <- (n-1)*s.var/chi.low
    #cat(ci.low, ci.high, "\n") 
    if(ci.low < v & ci.high > v) x <- x+1
  }
  cat("The coverage probability in this simulation is:",
      x/N, "\n")
}

# Run the default simulation
sim2()
## The coverage probability in this simulation is: 0.7903
# Run some alternative simulations
sim2(n=500, N=10000, d=3)
## The coverage probability in this simulation is: 0.7493
sim2(n=500, N=10000, d=20)
## The coverage probability in this simulation is: 0.9125
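The explicit for loop mirrors the description of the simulation, but the same calculation can be written more compactly with replicate(). Here is a sketch (not part of the original code) using the sim2() defaults of n = 20 and d = 3:

cover.once <- function(n=20, d=3){
  v <- 2*d                                          # true variance of a chi-square(d)
  s.var <- var(rchisq(n, d))
  ci <- (n-1)*s.var/qchisq(c(0.975, 0.025), n-1)    # (lower, upper) 95% CI
  ci[1] < v & ci[2] > v                             # TRUE if the CI covers v
}
mean(replicate(10000, cover.once()))                # estimated coverage probability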

Next, let’s explore the power and type I error probabilities for this hypothesis test for a population variance.

# Testing Basic Code
n <- 100
v <- 1
x <- 0
N <- 1000
null <- 0.85 # equal to v (or 2d) gives the Type I error; a different value gives power
alpha <- 0.05
chi.lower <- qchisq(alpha/2, df=n-1)
chi.upper <- qchisq(1-alpha/2, df=n-1)
for(i in 1:N){
  sample <- rnorm(n, 0, sqrt(v))   # v is the variance, so the sd is sqrt(v)
  s.var <- var(sample)
  chi.ts <- (n-1)*s.var/null
  if (chi.ts < chi.lower | chi.ts > chi.upper) x <- x + 1
} # end for
if (null==v) {
  cat("The Type I Error probability in this simulation is:", x/N, " \n")
} else {
  cat("The Power to detect variance of ", v, " when the null is ", null, " in this simulation is:", x/N, " \n")
} # end else
## The Power to detect variance of  1  when the null is  0.85  in this simulation is: 0.24
# simulation - either population
# Provide  n = sample size
#          N = number of samples in simulation
#          normal = TRUE or FALSE
#              if TRUE, provide v = variance (the mean is fixed at 0)
#              if FALSE, provide d = df for chisq
#          null = hypothesized variance
#          alpha = significance level
sim1 <- function(n=20, N=1000, normal=TRUE, v=1, d=3, null=1, alpha=0.05){
  x <- 0
  chi.lower <- qchisq(alpha/2, df=n-1)
  chi.upper <- qchisq(1-alpha/2, df=n-1)
  if(normal==TRUE){
    for(i in 1:N){
      sample <- rnorm(n, 0, sqrt(v))   # v is the variance, so the sd is sqrt(v)
      s.var <- var(sample)
      chi.ts <- (n-1)*s.var/null
      if (chi.ts < chi.lower | chi.ts > chi.upper) x <- x + 1
    } # end for
    if (null==v) {
      cat("The Type I Error probability in this simulation is:", x/N, " \n")
    } else {
      cat("The Power to detect variance of ", v, " when the null is ", null, "\n in this simulation is:", x/N, " \n")
    } # end else
  } else {
    v <- 2*d   # the variance of a chi-square(d) population is 2d
    for(i in 1:N){
      sample <- rchisq(n, d)
      s.var <- var(sample)
      chi.ts <- (n-1)*s.var/null
      if (chi.ts < chi.lower | chi.ts > chi.upper) x <- x + 1
    } # end for
    if (null==v) {
      cat("The Type I Error probability in this simulation is:", x/N, " \n")
    } else {
      cat("The Power to detect variance of", v, "when the null is ", null, "\n in this simulation is:", x/N, " \n")
    } # end else    
  } # end else
} # end function

# Run the default simulation
sim1()
## The Type I Error probability in this simulation is: 0.068
sim1(n=100)
## The Type I Error probability in this simulation is: 0.057
sim1(n=1000, N=10000)
## The Type I Error probability in this simulation is: 0.0514
# Run for type I error - non-normal
sim1(d=3, normal=FALSE, null=6)
## The Type I Error probability in this simulation is: 0.215
# Run for power - normal
sim1(null=0.85)
## The Power to detect variance of  1  when the null is  0.85 
##  in this simulation is: 0.086
sim1(n=100, N=10000, null=0.85, alpha=0.05)
## The Power to detect variance of  1  when the null is  0.85 
##  in this simulation is: 0.227
sim1(n=1000, N=10000, null=0.85, alpha=0.05)
## The Power to detect variance of  1  when the null is  0.85 
##  in this simulation is: 0.9518
# Run for power - non-normal
sim1(d=3, normal=FALSE, null=7)
## The Power to detect variance of 6 when the null is  7 
##  in this simulation is: 0.256
sim1(n=100, d=3, normal=FALSE, null=5)
## The Power to detect variance of 6 when the null is  5 
##  in this simulation is: 0.366
sim1(n=100, d=3, N=10000, normal=FALSE, null=5)
## The Power to detect variance of 6 when the null is  5 
##  in this simulation is: 0.3519
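To see how the rejection probability changes as the null value moves away from the true variance, you could trace an empirical power curve. This is a sketch for the normal case with true variance 1; the grid of null values and the simulation sizes are arbitrary choices, not part of the lecture code.

power.at <- function(null, n=100, N=2000, v=1, alpha=0.05){
  lo <- qchisq(alpha/2, df=n-1)
  hi <- qchisq(1-alpha/2, df=n-1)
  ts <- (n-1)*replicate(N, var(rnorm(n, 0, sqrt(v))))/null
  mean(ts < lo | ts > hi)                  # proportion of samples that reject H0
}
nulls <- seq(0.6, 1.6, by=0.1)
plot(nulls, sapply(nulls, power.at), type="b",
     xlab="Null value of the variance", ylab="Rejection probability")
abline(h=0.05, lty=2)                      # nominal alpha where null equals v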