# read in the tidyverse
library(tidyverse)
# set seed: makes sure the "random" generation comes up with the same
# combination of numbers every time the script is run
set.seed(1)
# population: 10000 numbers drawn from a uniform distribution on [2, 8]
uniform <- runif(10000, min = 2, max = 8)
# turn the vector into a data frame (a single column named `uniform`)
# so that it can be handed to ggplot()
uniformdf <- as.data.frame(uniform)
# histogram of the population: 40 equal-width bins spanning 2 to 8, with a
# vertical line drawn at the population mean
ggplot(uniformdf, aes(x = uniform)) +
  geom_histogram(
    breaks = seq(2, 8, length.out = 41),
    fill = "firebrick",
    color = "firebrick",
    alpha = 0.7
  ) +
  # mean of the raw population vector
  geom_vline(xintercept = mean(uniform), linewidth = 2) +
  # one x-axis tick per unit; no padding added to the y axis
  scale_x_continuous(breaks = seq(from = 2, to = 8, by = 1)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 305)) +
  labs(x = "Continuous value", y = "Count") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 18)
  )
1. Description
In this optional problem, you’ll test out the Central Limit Theorem by repeatedly “sampling” a population with a uniform distribution with sample sizes of n = 2, n = 15, and n = 30. You’ll create histograms of the sampling distributions (i.e. means from the samples you generate) to see how the spread of the sampling distribution gets narrower with increasing sample size.
2. General guidance
You don’t have to follow these steps exactly (or at all), but this is a workflow that might make sense. Try it out on your own!
a. Steps
- Create a script or Quarto document to work in.
- Copy/paste the code in the Set up code chunk into your script. Run the code.
- Calculate the population mean. Store this as an object.
- Find the function that allows you to “sample” from a vector of numbers. If you don’t know the function, one google search could be “r sample numbers”.
- Resample (i.e. take a sample multiple times) 100 times from the population, taking a sample of n = 2 each time.
- Calculate the mean every time you take a sample. Store each mean in a list.
- Create a histogram of your sample means using the list from step 6.
- Repeat steps 5-7 for n = 15 and n = 30.
`for()` loops
Doing repetitive tasks like steps 5-6 can get tiresome. You probably do not want to sample and calculate a mean “by hand” 300 times. Instead, you can write what’s called a `for()` loop. One resource for writing `for()` loops is the chapter on Iteration in R for Data Science. There are other resources out there too! Try finding one that you like.
b. Set up code
3. Solution
a. Resampling using a `for()` loop
# for() loop to sample 100x and calculate the mean
# number of resamples taken for each sample size
n_resamples <- 100
# creating holding vectors (preallocated so they are not grown inside the loop)
store2 <- numeric(n_resamples)
store15 <- numeric(n_resamples)
store30 <- numeric(n_resamples)
for (i in seq_len(n_resamples)) {
  # sample from the population (without replacement), calculate the mean,
  # and store that mean in the vector matching the sample size
  store2[i] <- mean(sample(uniform, 2, replace = FALSE))
  store15[i] <- mean(sample(uniform, 15, replace = FALSE))
  store30[i] <- mean(sample(uniform, 30, replace = FALSE))
}
# double checking that the holding vectors actually have values in them
# (lines starting with #> show the printed console output)
head(store2)
#> [1] 5.274438 4.708106 5.089106 5.065707 4.728355 5.068479
head(store15)
#> [1] 5.642239 4.914242 4.723196 5.237746 4.314470 5.602474
head(store30)
#> [1] 5.432664 5.037553 5.165615 4.619024 4.573481 5.170263
b. n = 2 histogram
Before plotting the histogram, I’ll put the output from the `for()` loop into a data frame.
# putting everything together in a data frame (not necessary but nice to do):
# cbind() lines the three holding vectors up as columns named store2,
# store15, and store30, and as.data.frame() converts the result
df <- cbind(store2, store15, store30) %>%
  as.data.frame()
Then, I’ll plot the first histogram for n = 2.
# histogram of the sample means for n = 2
ggplot(df, aes(x = store2)) +
  # 10 bins for the sample means
  geom_histogram(
    bins = 10,
    fill = "chocolate1",
    color = "chocolate1",
    alpha = 0.7
  ) +
  # fixed viewing window on both axes; no padding added to the y axis
  coord_cartesian(xlim = c(2, 8), ylim = c(0, 30)) +
  scale_y_continuous(expand = c(0, 0)) +
  # axis labels and overall look
  labs(x = "Sample means", y = "Count") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 18),
    plot.margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm")
  )
c. n = 15 histogram
# histogram of the sample means for n = 15
ggplot(df, aes(x = store15)) +
  # 12 bins for the sample means
  geom_histogram(
    bins = 12,
    fill = "darkorchid4",
    color = "darkorchid4",
    alpha = 0.7
  ) +
  # fixed viewing window on both axes; no padding added to the y axis
  coord_cartesian(xlim = c(2, 8), ylim = c(0, 30)) +
  scale_y_continuous(expand = c(0, 0)) +
  # axis labels and overall look
  labs(x = "Sample means", y = "Count") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 18),
    plot.margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm")
  )
d. n = 30 histogram
# histogram of the sample means for n = 30
ggplot(df, aes(x = store30)) +
  # 12 bins for the sample means
  geom_histogram(
    bins = 12,
    fill = "lightseagreen",
    color = "lightseagreen",
    alpha = 0.7
  ) +
  # fixed viewing window on both axes; no padding added to the y axis
  coord_cartesian(xlim = c(2, 8), ylim = c(0, 30)) +
  scale_y_continuous(expand = c(0, 0)) +
  # axis labels and overall look
  labs(x = "Sample means", y = "Count") +
  theme_bw() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 18),
    plot.margin = unit(c(0.5, 0.5, 0.1, 0.1), "cm")
  )