In this lecture, we will discuss simulations in R. We will introduce the functions in R for simulating from commong probability distributions and their cdf, pdf, and quantile functions. Then, we will look at a simulation experiment involving the chi-square hypothesis test for one variance parameter.
R has many built in probability distributions which all work in a similar manner. You should become familiar with the binomila, chi-square, F, normal, Poisson, student’s t, and uniform distributions. The functions for all of these distributions in R have the following form:
The pxxx and qxxx have a logical argument for lower.tail to choose between lower or upper tail probabilities.These distributions are useful for simulating data as well as for calculating critical values for confidence intervals and p-values for hypothesis tests.
Example 1. Binomila - require size (n) and prob (p)
# Part 1: Create a table and plot of the distribution
# function of a binomial with n=10 and p=0.25
# Define the possible x-values
x <- 0:10
# Create the probabilities of interest
y <- dbinom(x, 10, 0.25)
# Create a plot
barplot(y, names.arg=x, xlab="Possible Values of X",
ylab="P(X=x)", ylim=c(0,0.3),
main="PDF of Binomial (N=10 p=0.25)")
box()
# Create nice table output
tab <- round(data.frame(x,y), digits=4)
tab
## x y
## 1 0 0.0563
## 2 1 0.1877
## 3 2 0.2816
## 4 3 0.2503
## 5 4 0.1460
## 6 5 0.0584
## 7 6 0.0162
## 8 7 0.0031
## 9 8 0.0004
## 10 9 0.0000
## 11 10 0.0000
attributes(tab)$names[2] <- "P(X=x)"
# methods("print")
# help(print.data.frame)
print(tab, row.names=FALSE)
## x P(X=x)
## 0 0.0563
## 1 0.1877
## 2 0.2816
## 3 0.2503
## 4 0.1460
## 5 0.0584
## 6 0.0162
## 7 0.0031
## 8 0.0004
## 9 0.0000
## 10 0.0000
# Part 2: Create a table and plot of the CDF
y.cdf <- pbinom(x, 10, 0.25)
# Create a plot
barplot(y.cdf, names.arg=x, xlab="Possible Values of X",
ylab="Cumulative Probability", ylim=c(0,1),
main="CDF of Binomial (N=10 p=0.25)")
box()
# Create nice table output
tab <- round(data.frame(x,y, y.cdf), digits=4)
attributes(tab)$names[2] <- "P(X=x)"
attributes(tab)$names[3] <- "P(X <= x)"
print(tab, row.names=FALSE)
## x P(X=x) P(X <= x)
## 0 0.0563 0.0563
## 1 0.1877 0.2440
## 2 0.2816 0.5256
## 3 0.2503 0.7759
## 4 0.1460 0.9219
## 5 0.0584 0.9803
## 6 0.0162 0.9965
## 7 0.0031 0.9996
## 8 0.0004 1.0000
## 9 0.0000 1.0000
## 10 0.0000 1.0000
# Part 3: Find the 25th, 50th, and 75th percentiles
qbinom(c(0.25, 0.50, 0.75), 10, 0.25)
## [1] 2 2 3
# Part 4: Generate a random sample of 100 different binomial random variables
samp.x <- rbinom(100, 10, 0.25)
par(mfrow=c(1,2))
hist(samp.x, breaks=seq(-0.5,10.5, 1), xlab="Possible Values of X",
main="Random Sample of 100 Binomial RVs (N=10 p=0.25)",
xaxt="n")
axis(1, at=0:10, labels=0:10)
box()
barplot(y, names.arg=x, xlab="Possible Values of X",
ylab="P(X=x)", ylim=c(0,0.3),
main="PDF of Binomial (N=10 p=0.25)")
box()
par(mfrow=c(1,1))
# Maximize window for a good view of this graph
Example 2 Normal distribution - requires mean and sd
# Part 1: Create a plots of PDF and CDF of a normal distribution
# (mean=100, sd=16)
# Create a good coverage of the needed x-values (+/- 3 std devs)
100-16*3
## [1] 52
100+16*3
## [1] 148
x <- seq(50, 150, 0.5)
y.pdf <- dnorm(x, 100, 16)
y.cdf <- pnorm(x, 100, 16)
plot(x, y.pdf)
par(mfrow=c(1,2))
# In this case, plot will create a scatterplot by default
# we want to connect the points with lines and not see the points
plot(x, y.pdf, type="l")
plot(x, y.cdf, type="l")
par(mfrow=c(1,1))
# Part 2: Find quantiles/percentiles and p-values
# Suppose we want a 95% confidence interval
# We need Z_0.975 from a standard normal
qnorm(0.975, 0, 1)
## [1] 1.959964
# We can find specific percentiles for a given normal distribution
qnorm(c(0.25, 0.75), 100, 16)
## [1] 89.20816 110.79184
# Suppose the Z-value for a two-sided hypothesis test is 2.75
# we can calculate the p-value
half.pval <- pnorm(2.75, 0, 1, lower.tail=FALSE)
pval <- 2*half.pval
pval
## [1] 0.005959526
# Part 3: Random sample from a normal distribution
samp.x <- rnorm(200, 100, 16)
par(mfrow=c(1,2))
hist(samp.x)
box()
plot(x, y.pdf, type="l")
par(mfrow=c(1,1))
For this simulation experiment, we will explore the properties of the hypothesis test and confidence interval for a population variance. If we sample from a normal population with variance \(\sigma^2\), then the we test \[H_0:\sigma^2=\sigma^2_0\;vs\;H_1:\sigma^2\neq\sigma^2_0\] by calculating the test statistc \[T=\dfrac{(n-1)s^2}{\sigma^2_0}\overset{H_0}{\sim}\chi^2(df=n-1).\] We reject the null hypothesis for this two sided test if \(T>\chi^2_{1-\alpha/2,n-1}\text{ or }\;T<\chi^2_{\alpha/2,n-1}\). The confidence interval for \(\sigma^2\) is given by \[\left(\dfrac{(n-1)s^2}{\chi^2_{1-\alpha/2,n-1}},\dfrac{(n-1)s^2}{\chi^2_{\alpha/2,n-1}}\right)\]
Let’s start with the condidence interval.
####################################
# Coverage Probability Simulations #
####################################
# Testing Basic Code
n<-10000
v<-1
x<-0
N<-50
for(i in 1:N){
sample <- rnorm(n, 0, v)
s.var <- var(sample)
chi.low <- qchisq(0.025,n-1)
chi.high <- qchisq(0.975,n-1)
ci.low <- (n-1)*s.var/chi.high
ci.high <- (n-1)*s.var/chi.low
#cat(ci.low, ci.high, "\n") #uncomment to see each confidence interval calculated
if(ci.low < v & ci.high > v) x<-x+1
}
cat("The coverage probability in this simulation is:",
x/N, "\n")
## The coverage probability in this simulation is: 0.98
# first simulation - normal population
# Since the population is normal, we know the exact coverage probability is 95%
sim1 <- function(n=100, v=1, N=10000){
x<-0
for(i in 1:N){
sample <- rnorm(n, 0, v)
s.var <- var(sample)
chi.low <- qchisq(0.025,n-1)
chi.high <- qchisq(0.975,n-1)
ci.low <- (n-1)*s.var/chi.high
ci.high <- (n-1)*s.var/chi.low
#cat(ci.low, ci.high, "\n") #uncomment to see each confidence interval calculated
if(ci.low < v & ci.high > v) x<-x+1
}
cat("The coverage probability in this simulation is:",
x/N, "\n")
}
# Run the default simulation
sim1()
## The coverage probability in this simulation is: 0.9507
# Run some alternative simulations
sim1(n=20, v=1, N=10000)
## The coverage probability in this simulation is: 0.9509
sim1(n=5, v=1, N=10000)
## The coverage probability in this simulation is: 0.9463
# second simulation - non-normal population
# With the population no longer normal, we don't know the exact coverage probability
sim2 <- function(n=20, N=10000,d=3){
x<-0
v<-2*d
for(i in 1:N){
sample <- rchisq(n, d)
s.var <- var(sample)
chi.low <- qchisq(0.025,n-1)
chi.high <- qchisq(0.975,n-1)
ci.low <- (n-1)*s.var/chi.high
ci.high <- (n-1)*s.var/chi.low
#cat(ci.low, ci.high, "\n")
if(ci.low < v & ci.high > v) x<-x+1
}
cat("The coverage probability in this simulation is:",
x/N, "\n")
}
# Run the default simulation
sim2()
## The coverage probability in this simulation is: 0.7903
# Run some alternative simulations
sim2(n=500, N=10000, d=3)
## The coverage probability in this simulation is: 0.7493
sim2(n=500, N=10000, d=20)
## The coverage probability in this simulation is: 0.9125
Next, let’s explore the power and type I error probabilities for this hypothesis test for a population variance.
# Testing Basic Code
n<-100
v<-1
x<-0
N<-1000
null <- 0.85 # same as v (or 2d) give type I error different gives power
alpha <- 0.05
chi.lower <- qchisq(alpha/2,df=n-1)
chi.upper <- qchisq(1-alpha/2,df=n-1)
for(i in 1:N){
sample <- rnorm(n, 0, v)
s.var <- var(sample)
chi.ts <- (n-1)*s.var/null
if (chi.ts < chi.lower | chi.ts > chi.upper) x <- x + 1
} # end for
if (null==v) {
cat("The Type I Error probability in this simulation is:",x/N," \n")
} else {
cat("The Power to detect variance of ",v, " when the null is ", null," in this simulation is:", x/N, " \n")
} # end else
## The Power to detect variance of 1 when the null is 0.85 in this simulation is: 0.24
# simulation - either population
# Provide n = sample size
# N = number of samples in simulation
# normal = TRUE
# if TRUE provide m = mean and v = variance
# if FALSE provide d = df for chisq
# null = hypothesized mean
sim1 <- function(n=20, N=1000,normal=TRUE, v=1, d=3, null=1, alpha=0.05){
x<-0
chi.lower <- qchisq(alpha/2,df=n-1)
chi.upper <- qchisq(1-alpha/2,df=n-1)
if(normal==TRUE){
for(i in 1:N){
sample <- rnorm(n, 0, v)
s.var <- var(sample)
chi.ts <- (n-1)*s.var/null
if (chi.ts < chi.lower | chi.ts > chi.upper) x <- x + 1
} # end for
if (null==v) {
cat("The Type I Error probability in this simulation is:",x/N," \n")
} else {
cat("The Power to detect variance of ",v, " when the null is ", null, "\n in this simulation is:", x/N, " \n")
} # end else
} else {
v<-2*d
for(i in 1:N){
sample <- rchisq(n, d)
s.var <- var(sample)
chi.ts <- (n-1)*s.var/null
if (chi.ts < chi.lower | chi.ts > chi.upper) x <- x + 1
} # end for
if (null==v) {
cat("The Type I Error probability in this simulation is:",x/N," \n")
} else {
cat("The Power to detect variance of", v, "when the null is ", null, "\n in this simulation is:", x/N, " \n")
} # end else
} # end else
} # end function
# Run the default simulation
sim1()
## The Type I Error probability in this simulation is: 0.068
sim1(n=100)
## The Type I Error probability in this simulation is: 0.057
sim1(n=1000, N=10000)
## The Type I Error probability in this simulation is: 0.0514
# Run for type I error - non-normal
sim1(d=3, normal=FALSE, null=6)
## The Type I Error probability in this simulation is: 0.215
# Run for power - normal
sim1(null=0.85)
## The Power to detect variance of 1 when the null is 0.85
## in this simulation is: 0.086
sim1(n=100, N=10000, null=0.85, alpha=0.05)
## The Power to detect variance of 1 when the null is 0.85
## in this simulation is: 0.227
sim1(n=1000, N=10000, null=0.85, alpha=0.05)
## The Power to detect variance of 1 when the null is 0.85
## in this simulation is: 0.9518
# Run for power - non-normal
sim1(d=3, normal=FALSE, null=7)
## The Power to detect variance of 6 when the null is 7
## in this simulation is: 0.256
sim1(n=100, d=3, normal=FALSE, null=5)
## The Power to detect variance of 6 when the null is 5
## in this simulation is: 0.366
sim1(n=100, d=3, N=10000, normal=FALSE, null=5)
## The Power to detect variance of 6 when the null is 5
## in this simulation is: 0.3519