Week 1 figures - Lectures 1 and 2

central tendency
spread
regression
histogram
jitterplot
box-and-whisker plot
probability mass
probability density
probability distribution
normal distribution
z-score calculation
t-distribution
uniform distribution
binomial distribution
poisson distribution
Author
Affiliation
Lecture date

April 1, 2024

Modified

June 13, 2024

0. set up

Code
# cleaning
library(tidyverse)
# Global plot theme for every figure below: classic base theme with no grid
# lines, size-18 axis text and axis titles, and Lato as the text font family.
lecture_theme <- theme_classic() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )
theme_set(lecture_theme)

# visualization
library(patchwork)

1. Math

sample mean

\[ \bar{y} = \frac{1}{n}\sum_{i = 1}^ny_i \]

sample variance

\[ s^2 = \frac{\sum(y_i - \bar{y})^2}{n - 1} \]

sample standard deviation

\[ s = \sqrt{\frac{\sum(y_i - \bar{y})^2}{n - 1}} \]

coefficient of variation

\[ CV = \frac{\sigma}{\mu} \]

z-score for selecting a single individual

\[ z = \frac{y_i - \mu}{\sigma} \]

2. Mean and median

For data following a symmetrical distribution, the mean and median tend to be similar.

Code
# Simulate 40 values from a normal distribution (mean 6, sd 1) and draw their
# density curve with the sample mean (blue) and sample median marked by
# vertical lines.
set.seed(1)
sculpin_sample <- tibble(value = rnorm(n = 40, mean = 6, sd = 1))

ggplot(data = sculpin_sample, mapping = aes(x = value)) +
  geom_density() +
  # reference lines are computed from the sample itself; labels sit at the top
  geom_vline(aes(xintercept = mean(value)), color = "blue") +
  annotate("text", x = 5.75, y = 0.5, label = "mean", color = "blue") +
  geom_vline(aes(xintercept = median(value))) +
  annotate("text", x = 6.5, y = 0.5, label = "median") +
  labs(x = "Sculpin lengths (cm)") +
  scale_x_continuous(limits = c(2.5, 10)) +
  scale_y_continuous(limits = c(0, 0.5)) +
  # hide every part of the y axis; only the x position matters in this figure
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank()
  )

3. Range

Code
# Build a one-row jitter plot of 30 simulated values (normal, mean 6) with the
# sample mean overlaid as a blue point. The seed is reset inside the helper so
# that both panels start from the same random draws and differ only in the
# standard deviation passed as `spread`.
length_strip <- function(spread) {
  set.seed(1)
  sim <- tibble(value = rnorm(n = 30, mean = 6, sd = spread), y = 0)

  ggplot(sim, aes(x = value, y = y)) +
    geom_jitter(shape = 21) +
    geom_point(aes(x = mean(value), y = 0), color = "blue", size = 3) +
    labs(x = "Sculpin lengths (cm)") +
    scale_x_continuous(limits = c(0, 15)) +
    scale_y_continuous(limits = c(-0.5, 0.5)) +
    # the y position is only jitter, so the whole y axis is hidden
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.line.y = element_blank()
    )
}

# sd = 1
# min: 3.78
# max: 7.60
narrow <- length_strip(spread = 1)

# sd = 2
# min: 1.57
# max: 9.19
wide <- length_strip(spread = 2)

# combine the two panels into one figure (patchwork)
narrow + wide

How would you describe this data?

Code
# Example 1: 100 draws from an F distribution (df1 = 30, df2 = 10), followed
# by the sample mean.
n_ex1 <- 100
set.seed(1)
ex1 <- rf(n = n_ex1, df1 = 30, df2 = 10)
mean(x = ex1)
[1] 1.321259
Code
# sample median of the same draws, for comparison with the mean
median(ex1)
[1] 1.163362
Code
# Histogram of the example-1 draws in 9 bins. The y-scale expansion is set to
# zero so the bars sit directly on the x axis.
ex1_values <- tibble(value = ex1)

ggplot(data = ex1_values, mapping = aes(x = value)) +
  geom_histogram(fill = "orange", color = "#000000", bins = 9) +
  labs(x = "Hermit crab shell length (cm)") +
  scale_y_continuous(expand = c(0, 0))

Code
# Example 2: 100 draws from a normal distribution (mean 25, sd 5), followed by
# the sample mean.
n_ex2 <- 100
set.seed(1)
ex2 <- rnorm(n = n_ex2, mean = 25, sd = 5)
mean(x = ex2)
[1] 25.54444
Code
# sample median of the same draws, for comparison with the mean
median(ex2)
[1] 25.56955
Code
# Histogram of the example-2 draws in 9 bins. The y-scale expansion is set to
# zero so the bars sit directly on the x axis.
ex2_values <- tibble(value = ex2)

ggplot(data = ex2_values, mapping = aes(x = value)) +
  geom_histogram(fill = "darkgreen", color = "#000000", bins = 9) +
  labs(x = "Octopus arm length (cm)") +
  scale_y_continuous(expand = c(0, 0))

4. anemone regression example

Code
# number of arms: one simulated anemone for every arm count from 40 to 100
arms <- seq(from = 40, to = 100, by = 1)

# diameter: anemones can be up to 8 cm long
# Each diameter is one normal draw (sd = 1) whose mean rises linearly from 1
# to 5 across the arm counts, which produces a noisy positive trend.
# `length.out` is spelled out in full rather than relying on partial matching.
set.seed(10)
diam_means <- seq(from = 1, to = 5, length.out = length(arms))
diam <- rnorm(n = length(arms), mean = diam_means, sd = 1)

# create a data frame with columns `diam` and `arms` (built directly, without
# the intermediate matrix that cbind() would create)
df <- data.frame(diam = diam, arms = arms)

# Scatter plot of simulated diameter against number of arms.
anemone_scatter <- ggplot(data = df, mapping = aes(x = arms, y = diam)) +
  geom_point(size = 2) +
  labs(y = "Diameter (cm)", x = "Number of arms")

anemone_scatter

Code
# Same scatter plot with a straight-line fit drawn through the points.
# theme_bw() is a complete theme, so the text-size, font and grid settings are
# applied again on top of it.
bw_lecture_theme <- theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

ggplot(data = df, mapping = aes(x = arms, y = diam)) +
  geom_point(size = 2) +
  # just using geom smooth for the purposes of visualization
  geom_smooth(method = "lm", linewidth = 2, se = FALSE) +
  labs(y = "Diameter (cm)", x = "Number of arms") +
  bw_lecture_theme

5. histogram example

The Rice rule is a guideline for calculating the number of bins in a histogram:

\[ bins = 2n^{1/3} \]

where \(n\) is the number of observations. This is an example of a histogram that does follow the Rice rule: with \(n = 61\) anemones, \(2 \times 61^{1/3} \approx 7.9\), so the bin number is 8.

Code
# Histogram with one bin per centimetre from 0 to 8 cm (8 bins). The same
# break points are used for the bins and for the x-axis ticks.
diam_breaks <- seq(from = 0, to = 8, by = 1)

ggplot(data = df, mapping = aes(x = diam)) +
  geom_histogram(breaks = diam_breaks, fill = "lightblue", color = "#000000") +
  scale_x_continuous(breaks = diam_breaks) +
  scale_y_continuous(
    limits = c(0, 19),
    breaks = seq(from = 0, to = 18, by = 3),
    expand = c(0, 0)
  ) +
  labs(y = "Count", x = "Anemone diameter (cm)") +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

These histograms do not follow the Rice rule, which makes it difficult to see the distribution:

Code
# Same data with no bin specification, so geom_histogram() falls back to its
# default binning (30 bins): far more bins than this small sample supports.
ggplot(data = df, mapping = aes(x = diam)) +
  geom_histogram(fill = "lightblue", color = "#000000") +
  scale_x_continuous(breaks = seq(from = 0, to = 8, by = 1)) +
  scale_y_continuous(
    limits = c(0, 19),
    breaks = seq(from = 0, to = 18, by = 3),
    expand = c(0, 0)
  ) +
  labs(y = "Count", x = "Anemone diameter (cm)") +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

Code
# Same data forced into only 3 bins: too few to show the shape of the
# distribution.
ggplot(data = df, mapping = aes(x = diam)) +
  geom_histogram(bins = 3, fill = "lightblue", color = "#000000") +
  scale_x_continuous(breaks = seq(from = 0, to = 8, by = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  labs(y = "Count", x = "Anemone diameter (cm)") +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

6. jitter plot and box and whisker plot example

Code
# Simulate 20 values for each of three groups (normal distributions with
# different means and spreads), then reshape to long format: one row per
# observation, with the group in `name` and the measurement in `value`.
set.seed(1)

pretend_lengths <- tibble(
  juveniles = rnorm(20, mean = 2, sd = 0.5),
  females = rnorm(20, mean = 8, sd = 1),
  males = rnorm(20, mean = 4, sd = 1)
) %>%
  pivot_longer(cols = everything())

# Jitter plot: every observation is drawn, spread slightly sideways within its
# group so overlapping points stay visible.
# NOTE(review): the object is named `pretend_lengths` but the axis is labelled
# as a weight in grams - confirm which one is intended.
group_colors <- c("darkgreen", "cornflowerblue", "orange")

ggplot(data = pretend_lengths, mapping = aes(x = name, y = value, color = name)) +
  geom_jitter(width = 0.1, size = 2, alpha = 0.8) +
  scale_color_manual(values = group_colors) +
  labs(y = "Weight (g)") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank()
  )

Code
# Box-and-whisker plot of the same three groups; outline and fill share one
# colour per group, with the fill made slightly transparent.
# NOTE(review): the object is named `pretend_lengths` but the axis is labelled
# as a weight in grams - confirm which one is intended.
group_colors <- c("darkgreen", "cornflowerblue", "orange")

ggplot(
  data = pretend_lengths,
  mapping = aes(x = name, y = value, color = name, fill = name)
) +
  geom_boxplot(alpha = 0.8) +
  scale_fill_manual(values = group_colors) +
  scale_color_manual(values = group_colors) +
  labs(y = "Weight (g)") +
  theme(
    legend.position = "none",
    axis.title.x = element_blank()
  )

7. Probability mass example

Code
# Poisson probability mass function (lambda = 10) drawn as bars. The x values
# run from 1 to 55 and the function is evaluated at 55 points, so the
# evaluation points should fall on whole numbers. coord_cartesian() zooms the
# view to 0-22.
clump_sizes <- data.frame(x = 1:55)

ggplot(data = clump_sizes, mapping = aes(x)) +
  stat_function(
    fun = dpois,
    args = list(lambda = 10),
    n = 55,
    geom = "bar",
    fill = "coral"
  ) +
  coord_cartesian(xlim = c(0, 22)) +
  scale_y_continuous(limits = c(0, 0.13), expand = c(0, 0)) +
  labs(y = "Probability mass", x = "Mussel clump size (count)") +
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

8. Probability density example

Code
# Normal density curve (mean 10, sd 2) with the area between 12 and 14 shaded
# and both bounds marked by dashed grey lines.
weight_dist <- list(mean = 10, sd = 2)

ggplot(data = data.frame(x = 1:20), mapping = aes(x)) +
  # full density curve
  stat_function(fun = dnorm, args = weight_dist, n = 100, geom = "line", linewidth = 1) +
  # shaded region between the two bounds
  stat_function(fun = dnorm, args = weight_dist, xlim = c(12, 14), geom = "area", fill = "turquoise3") +
  geom_vline(xintercept = 12, color = "grey", linewidth = 1, lty = 2) +
  geom_vline(xintercept = 14, color = "grey", linewidth = 1, lty = 2) +
  # disabled zoom, kept for reference:
  # coord_cartesian(xlim = c(0, 22)) +
  labs(y = "Probability density", x = "Individual mussel weight (g)") +
  scale_y_continuous(limits = c(0, 0.22), expand = c(0, 0))

9. probability distribution

Code
# 100,000 standard-normal draws stored in a tibble (`value` holds the draws,
# `x` the row names), shown as a density-scaled histogram with the theoretical
# standard-normal curve drawn on top in blue.
set.seed(1)
normdist <- as_tibble(rnorm(n = 100000, mean = 0, sd = 1), rownames = "x")

ggplot(data = normdist) +
  geom_histogram(
    mapping = aes(x = value, y = after_stat(density)),
    bins = 100,
    color = "black",
    fill = "white"
  ) +
  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), linewidth = 2, color = "blue") +
  labs(y = "Density", x = "Continuous value") +
  scale_y_continuous(limits = c(0, 0.42), expand = c(0, 0))

10. normal distribution

Code
# Layers for one normal density curve plus its text label. `mu` and `sigma`
# set the curve and the label text, `colour` is shared by both, and
# `label_x`/`label_y` position the label.
normal_curve <- function(mu, sigma, colour, label_x, label_y) {
  list(
    stat_function(
      geom = "line", n = 1000, fun = dnorm,
      args = list(mean = mu, sd = sigma),
      linewidth = 1, color = colour
    ),
    annotate(
      "text", x = label_x, y = label_y,
      label = paste0("\U03BC = ", mu, ", \U03C3 = ", sigma),
      color = colour, size = 6
    )
  )
}

# Three normal distributions with different means and standard deviations on
# one set of axes.
ggplot(data = data.frame(x = -10:25), mapping = aes(x)) +
  normal_curve(mu = 0, sigma = 1, colour = "darkorange", label_x = 4.5, label_y = 0.4) +
  normal_curve(mu = 15, sigma = 3, colour = "blue", label_x = 16, label_y = 0.15) +
  normal_curve(mu = 5, sigma = 5, colour = "darkgreen", label_x = 7, label_y = 0.1) +
  labs(y = "Density", x = "Continuous value") +
  scale_x_continuous(breaks = seq(-10, 25, 5)) +
  scale_y_continuous(limits = c(0, 0.42), expand = c(0, 0))

11. z-score calculation

figure

We’ll use \(z = -1.23\) for this example.

Code
# z-score
q <- -1.23

# Standard normal curve with the area to the left of `q` shaded. The marker
# height and the shading limit are both derived from `q`, so changing the
# z-score above updates the whole figure consistently.
ggplot(data.frame(x = -4:4), aes(x)) +
  # zscore: vertical marker from the axis up to the curve at x = q
  geom_linerange(x = q, ymin = 0, ymax = dnorm(q, mean = 0, sd = 1)) +
  # area under the curve from the left edge of the plot up to q
  stat_function(
    geom = "area", fun = dnorm, args = list(mean = 0, sd = 1),
    xlim = c(-4, q), fill = "turquoise3"
  ) +
  # Z distribution curve
  stat_function(
    geom = "line", n = 1000, fun = dnorm, args = list(mean = 0, sd = 1),
    linewidth = 1.5, color = "darkorange"
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.45)) +
  # hide the y axis and both axis titles
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title = element_blank(),
        axis.line.y = element_blank())

calculation

Code
# lower-tail probability P(Z <= q) under the standard normal distribution
pnorm(q = q, mean = 0, sd = 1, lower.tail = TRUE)
[1] 0.1093486

You can compare this with the Z-score table.

chiton example

What is the probability of selecting a chiton that weighs less than 6 g, given a normally distributed population with \(\mu = 12\) g and \(\sigma = 3\) g?

Code
# population parameters and the cut-off value from the question
chiton_mu <- 12
chiton_sigma <- 3
chiton_cutoff <- 6

# calculate the z-score: distance of the cut-off from the mean, in sd units
chiton_z <- (chiton_cutoff - chiton_mu) / chiton_sigma

# calculate the probability under the standard normal curve to the left of z
pnorm(chiton_z, mean = 0, sd = 1)
[1] 0.02275013

12. 68-95-99.7 rule

In a normal distribution, 68% of values lie within 1 standard deviation of the mean, 95% within 2 standard deviations, and 99.7% within 3 standard deviations.

Code
# x-axis labels for the breaks at -4, -3, ..., 4: the outermost two are left
# blank and the rest are written in terms of mu and sigma
labels <- c(
  "", "\U03BC - 3\U03C3", "\U03BC - 2\U03C3", "\U03BC - \U03C3", "\U03BC", "\U03BC + \U03C3", "\U03BC + 2\U03C3", "\U03BC + 3\U03C3", ""
)

# One vertical marker at the mean and at 1, 2 and 3 standard deviations on
# either side. Heights come from dnorm() so each marker ends exactly on the
# curve instead of at a hand-rounded value.
sd_marks <- data.frame(x = -3:3)
sd_marks$height <- dnorm(sd_marks$x, mean = 0, sd = 1)

ggplot(data.frame(x = -4:4), aes(x)) +
  geom_linerange(data = sd_marks, aes(x = x, ymin = 0, ymax = height)) +
  # standard normal curve
  stat_function(
    geom = "line", n = 1000, fun = dnorm, args = list(mean = 0, sd = 1),
    linewidth = 1.5, color = "darkorange"
  ) +
  scale_x_continuous(labels = labels, breaks = seq(-4, 4, by = 1)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.41)) +
  labs(x = "") +
  theme_classic() +
  # larger axis text for the mu/sigma labels; the y axis and all ticks are hidden
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 24),
        axis.line.y = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank())

13. Student’s t distribution

Code
# Layers for one t-distribution density curve plus its label. `nu` is the
# degrees of freedom, `colour` is shared by curve and label, and `label_y`
# sets the label height (all labels sit at x = 3.5).
t_curve <- function(nu, colour, label_y) {
  list(
    stat_function(
      geom = "line", n = 1000, fun = dt,
      args = list(df = nu),
      linewidth = 1, color = colour
    ),
    annotate(
      "text", x = 3.5, y = label_y,
      label = paste0("\U03BD = ", nu),
      color = colour, size = 6
    )
  )
}

# t distributions with 1, 3, 5 and 100 degrees of freedom on one set of axes.
ggplot(data = data.frame(x = -10:10), mapping = aes(x)) +
  t_curve(nu = 1, colour = "#856F33", label_y = 0.3) +
  t_curve(nu = 3, colour = "#E6821C", label_y = 0.35) +
  t_curve(nu = 5, colour = "#56E9E7", label_y = 0.37) +
  t_curve(nu = 100, colour = "#04B37F", label_y = 0.4) +
  labs(y = "Density", x = "Continuous value") +
  scale_y_continuous(limits = c(0, 0.42), expand = c(0, 0)) +
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

14. Uniform distribution

Code
# Density of a uniform distribution on [2, 8], with the lower bound (a) and
# upper bound (b) labelled just above the curve.
unif_bounds <- list(min = 2, max = 8)

ggplot(data = data.frame(x = 0:10), mapping = aes(x)) +
  stat_function(
    fun = dunif, args = unif_bounds, n = 1000,
    geom = "line", color = "firebrick4", linewidth = 1
  ) +
  annotate("text", x = 2, y = 0.172, label = "a = 2", size = 6, color = "firebrick4") +
  annotate("text", x = 8, y = 0.172, label = "b = 8", size = 6, color = "firebrick4") +
  labs(y = "Density", x = "Continuous value") +
  scale_x_continuous(breaks = seq(0, 10, 2)) +
  scale_y_continuous(limits = c(-0.001, 0.18), expand = c(0, 0)) +
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

14. Binomial distribution

Code
# Layers for one binomial probability mass function with 20 trials: a black
# connecting line, coloured points, and a text label. `prob` is the success
# probability; `label_x`/`label_y` position the label.
# The function is evaluated at 21 points so that, over x = 0..20, each point
# is a whole number of successes. `prob` is spelled out in full because
# dbinom() has no argument literally named `p`.
binom_pmf <- function(prob, colour, label_x, label_y) {
  list(
    stat_function(
      geom = "line", n = 21, fun = dbinom,
      args = list(size = 20, prob = prob),
      color = "black"
    ),
    stat_function(
      geom = "point", n = 21, fun = dbinom,
      args = list(size = 20, prob = prob),
      color = colour, size = 3
    ),
    annotate(
      "text", x = label_x, y = label_y,
      label = paste0("n = 20, p = ", prob),
      color = colour, size = 6
    )
  )
}

# The x values start at 0: zero successes is part of the binomial support and
# carries visible probability mass when p is small.
ggplot(data.frame(x = 0:20), aes(x)) +
  binom_pmf(prob = 0.1, colour = "#6D9929", label_x = 5.5, label_y = 0.29) +
  binom_pmf(prob = 0.4, colour = "#4A76E5", label_x = 8, label_y = 0.2) +
  binom_pmf(prob = 0.7, colour = "#E67960", label_x = 15, label_y = 0.21) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.32)) +
  labs(x = "Number of successes", y = "Mass") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))

15. Poisson distribution

Code
# Layers for one Poisson probability mass function: a black connecting line,
# coloured points, and a text label. `lambda` is the rate; `label_x`/`label_y`
# position the label.
# The function is evaluated at 21 points so that, over x = 0..20, each point
# is a whole-number count.
pois_pmf <- function(lambda, colour, label_x, label_y) {
  list(
    stat_function(
      geom = "line", n = 21, fun = dpois,
      args = list(lambda = lambda),
      color = "black"
    ),
    stat_function(
      geom = "point", n = 21, fun = dpois,
      args = list(lambda = lambda),
      color = colour, size = 4
    ),
    annotate(
      "text", x = label_x, y = label_y,
      label = paste0("\U03BB = ", lambda),
      color = colour, size = 6
    )
  )
}

# The x values start at 0: a count of zero is part of the Poisson support and
# is as likely as a count of one when lambda = 1.
ggplot(data.frame(x = 0:20), aes(x)) +
  pois_pmf(lambda = 1, colour = "coral", label_x = 3, label_y = 0.37) +
  pois_pmf(lambda = 4, colour = "darkgreen", label_x = 6, label_y = 0.2) +
  pois_pmf(lambda = 10, colour = "turquoise", label_x = 14, label_y = 0.12) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.42)) +
  labs(x = "Discrete value", y = "Mass")

Citation

BibTeX citation:
@online{bui2024,
  author = {Bui, An},
  title = {Week 1 Figures - {Lectures} 1 and 2},
  date = {2024-04-01},
  url = {https://spring-2024.envs-193ds.com/lecture/lecture_week-01.html},
  langid = {en}
}
For attribution, please cite this work as:
Bui, An. 2024. “Week 1 Figures - Lectures 1 and 2.” April 1, 2024. https://spring-2024.envs-193ds.com/lecture/lecture_week-01.html.