Spring 2024 - Week 1 figures - Lectures 1 and 2

0. set up

Code

# packages ----
# tidyverse supplies the wrangling verbs and ggplot2; patchwork lets two
# ggplots be combined with `+`
library(tidyverse)
library(patchwork)

# plotting defaults ----
# every figure starts from theme_classic() with grid lines removed, size-18
# axis text and titles, and "Lato" as the text family
theme_set(
  theme_classic() +
    theme(
      text = element_text(family = "Lato"),
      axis.title = element_text(size = 18),
      axis.text = element_text(size = 18),
      panel.grid = element_blank()
    )
)

1. Math

sample mean

\[ \bar{y} = \frac{1}{n}\sum_{i = 1}^ny_i \]

sample variance

\[ s^2 = \frac{\sum(y_i - \bar{y})^2}{n - 1} \]

sample standard deviation

\[ s = \sqrt{\frac{\sum(y_i - \bar{y})^2}{n - 1}} \]

coefficient of variation

\[ CV = \frac{\sigma}{\mu} \]

z-score for selecting a single individual

\[ z = \frac{y_i - \mu}{\sigma} \]

2. Mean and median

For data following a symmetrical distribution, the mean and median tend to be similar.

Code

# Density of 40 simulated values from a normal distribution (mean 6, sd 1),
# with the sample mean (blue) and sample median (black) marked as vertical
# lines. The seed makes the draw reproducible.
set.seed(1)
ggplot(
  data = tibble(value = rnorm(n = 40, mean = 6, sd = 1)),
  mapping = aes(x = value)
) +
  geom_density() +
  # mean: line plus a hand-placed label to its left
  geom_vline(aes(xintercept = mean(value)), color = "blue") +
  annotate("text", label = "mean", x = 5.75, y = 0.5, color = "blue") +
  # median: line plus a hand-placed label to its right
  geom_vline(aes(xintercept = median(value))) +
  annotate("text", label = "median", x = 6.5, y = 0.5) +
  scale_x_continuous(limits = c(2.5, 10)) +
  scale_y_continuous(limits = c(0, 0.5)) +
  labs(x = "Sculpin lengths (cm)") +
  # the density axis carries no information for this figure, so hide it
  theme(
    axis.line.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.title.y = element_blank(),
    axis.text.y = element_blank()
  )

3. Range

Code

# Strip plot of 30 simulated lengths from a normal distribution with mean 6
# and standard deviation `spread`. Observations are drawn as open circles
# jittered around y = 0, and the sample mean is overlaid as a blue point.
# The seed is reset on every call so both panels use the same random stream.
sculpin_strip <- function(spread) {
  set.seed(1)
  lengths_tbl <- tibble(
    value = rnorm(n = 30, mean = 6, sd = spread),
    y = 0
  )

  ggplot(lengths_tbl, aes(x = value, y = y)) +
    geom_jitter(shape = 21) +
    geom_point(aes(x = mean(value), y = 0), color = "blue", size = 3) +
    scale_x_continuous(limits = c(0, 15)) +
    scale_y_continuous(limits = c(-0.5, 0.5)) +
    labs(x = "Sculpin lengths (cm)") +
    # y only exists to spread the points out, so hide the whole axis
    theme(
      axis.title.y = element_blank(),
      axis.text.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.line.y = element_blank()
    )
}

narrow <- sculpin_strip(spread = 1)
# min: 3.78
# max: 7.60

wide <- sculpin_strip(spread = 2)
# min: 1.57
# max: 9.19

# side-by-side panels (patchwork)
narrow + wide

How would you describe this data?

Code

# example 1: 100 reproducible draws from an F distribution (df1 = 30,
# df2 = 10); print the sample mean
set.seed(1)
ex1 <- stats::rf(n = 100, df1 = 30, df2 = 10)
mean(x = ex1)

[1] 1.321259

Code

# sample median of example 1
stats::median(x = ex1)

[1] 1.163362

Code

# 9-bin histogram of example 1; enframe() turns the vector into a tibble
# whose data column is called `value`
ggplot(enframe(ex1), aes(x = value)) +
  geom_histogram(fill = "orange", color = "#000000", bins = 9) +
  # bars sit directly on the x axis
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Hermit crab shell length (cm)")

Code

# example 2: 100 reproducible draws from a normal distribution (mean 25,
# sd 5); print the sample mean
set.seed(1)
ex2 <- stats::rnorm(n = 100, mean = 25, sd = 5)
mean(x = ex2)

[1] 25.54444

Code

# sample median of example 2
stats::median(x = ex2)

[1] 25.56955

Code

# 9-bin histogram of example 2; enframe() turns the vector into a tibble
# whose data column is called `value`
ggplot(enframe(ex2), aes(x = value)) +
  geom_histogram(fill = "darkgreen", color = "#000000", bins = 9) +
  # bars sit directly on the x axis
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Octopus arm length (cm)")

4. anemone regression example

Code

# number of arms: one simulated anemone per integer count from 40 to 100
arms <- seq(from = 40, to = 100, by = 1)

# diameter: anemones can be up to 8 cm long
# the expected diameter rises linearly from 1 to 5 across the arm counts,
# with normal noise (sd = 1) added to each anemone
set.seed(10)
diam <- rnorm(
  n = length(arms),
  # `length.out` spelled out in full rather than partially matched
  mean = seq(from = 1, to = 5, length.out = length(arms)),
  sd = 1
)

# create a data frame (built directly, no matrix round-trip through cbind())
df <- data.frame(diam = diam, arms = arms)

# scatter plot of simulated diameter against number of arms
ggplot(data = df, mapping = aes(x = arms, y = diam)) +
  labs(y = "Diameter (cm)", x = "Number of arms") +
  geom_point(size = 2)

Code

# same scatter plot with a straight-line fit drawn on top
ggplot(data = df, mapping = aes(x = arms, y = diam)) +
  geom_point(size = 2) +
  # just using geom smooth for the purposes of visualization
  geom_smooth(
    method = "lm",
    se = FALSE,
    linewidth = 2
  ) +
  labs(y = "Diameter (cm)", x = "Number of arms") +
  # this figure uses theme_bw() instead of the document-wide theme, so the
  # text and grid settings are applied again on top of it
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

5. histogram example

The Rice rule is a guideline for calculating the number of bins in a histogram:

\[ bins = 2n^{1/3} \]

where $n$ is the number of observations. This is an example of a histogram that does follow the Rice rule: with $n = 61$ observations, $2 \times 61^{1/3} \approx 7.9$, so the bin number is 8.

Code

# histogram of anemone diameters with eight bins, one per unit between the
# explicit breaks 0, 1, ..., 8
ggplot(data = df, mapping = aes(x = diam)) +
  geom_histogram(
    breaks = seq(0, 8, by = 1),
    fill = "lightblue",
    color = "#000000"
  ) +
  scale_x_continuous(breaks = seq(0, 8, by = 1)) +
  # bars sit directly on the x axis; y axis fixed at 0-19 with ticks every 3
  scale_y_continuous(
    breaks = seq(0, 18, by = 3),
    limits = c(0, 19),
    expand = c(0, 0)
  ) +
  labs(y = "Count", x = "Anemone diameter (cm)") +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

These histograms do not follow the Rice rule, which makes the distribution difficult to see:

Code

# same data, but no bins or breaks are supplied, so geom_histogram() falls
# back on its default binning
ggplot(data = df, mapping = aes(x = diam)) +
  geom_histogram(fill = "lightblue", color = "#000000") +
  scale_x_continuous(breaks = seq(0, 8, by = 1)) +
  # same y axis as the eight-bin histogram so the two can be compared
  scale_y_continuous(
    breaks = seq(0, 18, by = 3),
    limits = c(0, 19),
    expand = c(0, 0)
  ) +
  labs(y = "Count", x = "Anemone diameter (cm)") +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

Code

# same data squeezed into only three bins
ggplot(data = df, mapping = aes(x = diam)) +
  geom_histogram(bins = 3, fill = "lightblue", color = "#000000") +
  scale_x_continuous(breaks = seq(0, 8, by = 1)) +
  # bars sit directly on the x axis; the y range is left to ggplot2 here
  scale_y_continuous(expand = c(0, 0)) +
  labs(y = "Count", x = "Anemone diameter (cm)") +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

6. jitter plot and box and whisker plot example

Code

# Simulate 20 values for each of three groups, then reshape to long format:
# one row per observation, with the group in `name` and the draw in `value`.
# The groups are drawn in the order juveniles, females, males so the random
# stream is consumed in a fixed sequence after the seed.
set.seed(1)

pretend_lengths <- tibble(
  juveniles = rnorm(20, mean = 2, sd = 0.5),
  females = rnorm(20, mean = 8, sd = 1),
  males = rnorm(20, mean = 4, sd = 1)
) %>%
  pivot_longer(cols = everything())

# jitter plot: one point per observation, grouped and coloured by `name`
# NOTE(review): the y axis is labelled as a weight in grams although the data
# object is called `pretend_lengths` - confirm which quantity is intended
ggplot(
  data = pretend_lengths,
  mapping = aes(x = name, y = value, color = name)
) +
  geom_jitter(size = 2, alpha = 0.8, width = 0.1) +
  # colours spelled out per group (same assignment as the alphabetical default)
  scale_color_manual(
    values = c(
      females = "darkgreen",
      juveniles = "cornflowerblue",
      males = "orange"
    )
  ) +
  labs(y = "Weight (g)") +
  # the group names on the axis make a title and legend redundant
  theme(
    legend.position = "none",
    axis.title.x = element_blank()
  )

Code

# box-and-whisker plot of the same data, outline and fill coloured by `name`
# NOTE(review): the y axis is labelled as a weight in grams although the data
# object is called `pretend_lengths` - confirm which quantity is intended
ggplot(
  data = pretend_lengths,
  mapping = aes(x = name, y = value, color = name, fill = name)
) +
  geom_boxplot(alpha = 0.8) +
  # colours spelled out per group (same assignment as the alphabetical default)
  scale_color_manual(
    values = c(
      females = "darkgreen",
      juveniles = "cornflowerblue",
      males = "orange"
    )
  ) +
  scale_fill_manual(
    values = c(
      females = "darkgreen",
      juveniles = "cornflowerblue",
      males = "orange"
    )
  ) +
  labs(y = "Weight (g)") +
  # the group names on the axis make a title and legend redundant
  theme(
    legend.position = "none",
    axis.title.x = element_blank()
  )

7. Probability mass example

Code

# Poisson (lambda = 10) probability mass function drawn as bars.
# x spans 1 to 55 and n = 55, so dpois() is evaluated once per integer.
ggplot(data.frame(x = seq_len(55)), aes(x)) +
  stat_function(
    geom = "bar",
    fun = dpois,
    args = list(lambda = 10),
    n = 55,
    fill = "coral"
  ) +
  # zoom to counts 0-22 without dropping any evaluated points
  coord_cartesian(xlim = c(0, 22)) +
  scale_y_continuous(limits = c(0, 0.13), expand = c(0, 0)) +
  labs(y = "Probability mass", x = "Mussel clump size (count)") +
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

8. Probability density example

Code

# Normal (mean 10, sd 2) density curve with the area between 12 and 14 shaded
ggplot(data.frame(x = seq_len(20)), aes(x)) +
  # full curve over the x range
  stat_function(
    geom = "line",
    fun = dnorm,
    args = list(mean = 10, sd = 2),
    n = 100,
    linewidth = 1
  ) +
  # shaded region: the same density restricted to 12 <= x <= 14
  stat_function(
    geom = "area",
    fun = dnorm,
    args = list(mean = 10, sd = 2),
    xlim = c(12, 14),
    fill = "turquoise3"
  ) +
  # dashed guides at both edges of the shaded region
  geom_vline(xintercept = 12, color = "grey", linewidth = 1, lty = 2) +
  geom_vline(xintercept = 14, color = "grey", linewidth = 1, lty = 2) +
  scale_y_continuous(limits = c(0, 0.22), expand = c(0, 0)) +
  # coord_cartesian(xlim = c(0, 22)) +
  labs(y = "Probability density", x = "Individual mussel weight (g)")

9. probability distribution

Code

# 100,000 reproducible standard-normal draws; `rownames = "x"` asks
# as_tibble() for an extra column named x next to `value`
set.seed(1)
normdist <- as_tibble(
  rnorm(n = 100000, mean = 0, sd = 1),
  rownames = "x"
)

# histogram on the density scale with the theoretical N(0, 1) curve on top
ggplot(data = normdist) +
  geom_histogram(
    mapping = aes(x = value, y = after_stat(density)),
    bins = 100,
    color = "black",
    fill = "white"
  ) +
  stat_function(
    fun = dnorm,
    args = list(mean = 0, sd = 1),
    linewidth = 2,
    color = "blue"
  ) +
  scale_y_continuous(limits = c(0, 0.42), expand = c(0, 0)) +
  labs(y = "Density", x = "Continuous value")

10. normal distribution

Code

# three normal density curves over -10..25, each followed by a text label
# in the matching colour
ggplot(data.frame(x = seq(-10, 25)), aes(x)) +
  # mean 0, sd 1
  stat_function(
    geom = "line", fun = dnorm, args = list(mean = 0, sd = 1),
    n = 1000, linewidth = 1, color = "darkorange"
  ) +
  annotate(
    "text", label = "\U03BC = 0, \U03C3 = 1",
    x = 4.5, y = 0.4, size = 6, color = "darkorange"
  ) +
  # mean 15, sd 3
  stat_function(
    geom = "line", fun = dnorm, args = list(mean = 15, sd = 3),
    n = 1000, linewidth = 1, color = "blue"
  ) +
  annotate(
    "text", label = "\U03BC = 15, \U03C3 = 3",
    x = 16, y = 0.15, size = 6, color = "blue"
  ) +
  # mean 5, sd 5
  stat_function(
    geom = "line", fun = dnorm, args = list(mean = 5, sd = 5),
    n = 1000, linewidth = 1, color = "darkgreen"
  ) +
  annotate(
    "text", label = "\U03BC = 5, \U03C3 = 5",
    x = 7, y = 0.1, size = 6, color = "darkgreen"
  ) +
  scale_x_continuous(breaks = seq(-10, 25, 5)) +
  scale_y_continuous(limits = c(0, 0.42), expand = c(0, 0)) +
  labs(y = "Density", x = "Continuous value")

11. z-score calculation

figure

We’ll use $z = -1.23$ for this example.

Code

# z-score
# `q` drives the vertical line, its height and the shaded area below, so
# changing it here updates the whole figure
q <- -1.23

ggplot(data.frame(x = -4:4), aes(x)) +
  # zscore: vertical line from the axis up to the curve at z = q
  geom_linerange(x = q, ymin = 0, ymax = dnorm(q, mean = 0, sd = 1)) +
  # area under the curve to the left of q
  stat_function(
    geom = "area",
    fun = dnorm,
    args = list(mean = 0, sd = 1),
    xlim = c(-4, q),
    fill = "turquoise3"
  ) +
  # Z distribution curve
  stat_function(
    geom = "line",
    n = 1000,
    fun = dnorm,
    args = list(mean = 0, sd = 1),
    linewidth = 1.5,
    color = "darkorange"
  ) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.45)) +
  # only the z axis matters here, so hide the density axis and both titles
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title = element_blank(),
        axis.line.y = element_blank())

calculation

Code

# lower-tail probability of the standard normal at q
stats::pnorm(q = q, mean = 0, sd = 1, lower.tail = TRUE)

[1] 0.1093486

You can compare this with the Z-score table.

chiton example

What is the probability of selecting a chiton that weighs less than 6 g given a normally distributed population with $\mu = 12$ g and $\sigma = 3$ g?

Code

# z-score: (observation - population mean) / population standard deviation
chiton_z <- (6 - 12) / 3

# lower-tail probability of the standard normal at that z-score
stats::pnorm(q = chiton_z, mean = 0, sd = 1, lower.tail = TRUE)

[1] 0.02275013

12. 68-95-99.7 rule

In a normal distribution, 68% of values lie within 1 standard deviation of the mean, 95% within 2 standard deviations, and 99.7% within 3 standard deviations.

Code

# x-axis tick labels for -4..4: blank at both ends, mean +/- k sigma between
labels <- c(
  "",
  "\U03BC - 3\U03C3",
  "\U03BC - 2\U03C3",
  "\U03BC - \U03C3",
  "\U03BC",
  "\U03BC + \U03C3",
  "\U03BC + 2\U03C3",
  "\U03BC + 3\U03C3",
  ""
)

ggplot(data.frame(x = -4:4), aes(x)) +
  # vertical markers at +/-1, +/-2, +/-3 and 0, each with its own
  # hand-set height
  annotate(
    "linerange",
    x = c(1, -1, 2, -2, 3, -3, 0),
    ymin = 0,
    ymax = c(0.24, 0.24, 0.055, 0.055, 0.005, 0.005, 0.399)
  ) +
  # standard normal curve drawn over the markers
  stat_function(
    geom = "line",
    fun = dnorm,
    args = list(mean = 0, sd = 1),
    n = 1000,
    linewidth = 1.5,
    color = "darkorange"
  ) +
  scale_x_continuous(breaks = seq(-4, 4, by = 1), labels = labels) +
  scale_y_continuous(limits = c(0, 0.41), expand = c(0, 0)) +
  labs(x = "") +
  # theme_classic() resets the document-wide theme; only the large x-axis
  # labels and the x-axis line are kept
  theme_classic() +
  theme(
    axis.text = element_text(size = 24),
    axis.text.y = element_blank(),
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )

13. Student’s t distribution

Code

# Student's t density for 1, 3, 5 and 100 degrees of freedom, each curve
# followed by a label in the matching colour
ggplot(data.frame(x = seq(-10, 10)), aes(x)) +
  # df = 1
  stat_function(
    geom = "line", fun = dt, args = list(df = 1),
    n = 1000, linewidth = 1, color = "#856F33"
  ) +
  annotate(
    "text", label = "\U03BD = 1",
    x = 3.5, y = 0.3, size = 6, color = "#856F33"
  ) +
  # df = 3
  stat_function(
    geom = "line", fun = dt, args = list(df = 3),
    n = 1000, linewidth = 1, color = "#E6821C"
  ) +
  annotate(
    "text", label = "\U03BD = 3",
    x = 3.5, y = 0.35, size = 6, color = "#E6821C"
  ) +
  # df = 5
  stat_function(
    geom = "line", fun = dt, args = list(df = 5),
    n = 1000, linewidth = 1, color = "#56E9E7"
  ) +
  annotate(
    "text", label = "\U03BD = 5",
    x = 3.5, y = 0.37, size = 6, color = "#56E9E7"
  ) +
  # df = 100
  stat_function(
    geom = "line", fun = dt, args = list(df = 100),
    n = 1000, linewidth = 1, color = "#04B37F"
  ) +
  annotate(
    "text", label = "\U03BD = 100",
    x = 3.5, y = 0.4, size = 6, color = "#04B37F"
  ) +
  scale_y_continuous(limits = c(0, 0.42), expand = c(0, 0)) +
  labs(y = "Density", x = "Continuous value") +
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

14. Uniform distribution

Code

# Uniform density on [2, 8], with the two endpoints labelled a and b
ggplot(data.frame(x = seq(0, 10)), aes(x)) +
  stat_function(
    geom = "line",
    fun = dunif,
    args = list(min = 2, max = 8),
    n = 1000,
    linewidth = 1,
    color = "firebrick4"
  ) +
  annotate(
    "text", label = "a = 2",
    x = 2, y = 0.172, size = 6, color = "firebrick4"
  ) +
  annotate(
    "text", label = "b = 8",
    x = 8, y = 0.172, size = 6, color = "firebrick4"
  ) +
  scale_x_continuous(breaks = seq(0, 10, 2)) +
  # the lower limit sits just below zero
  scale_y_continuous(limits = c(-0.001, 0.18), expand = c(0, 0)) +
  labs(y = "Density", x = "Continuous value") +
  theme_bw() +
  theme(
    text = element_text(family = "Lato"),
    axis.title = element_text(size = 18),
    axis.text = element_text(size = 18),
    panel.grid = element_blank()
  )

14. Binomial distribution

Code

# Binomial probability mass functions for 20 trials and success probabilities
# 0.1, 0.4 and 0.7. Each distribution is a black connecting line, coloured
# points and a label in the matching colour.
# The x grid is 0..20 with n = 21 so dbinom() is evaluated at every integer in
# the support, including zero successes (the mass at 0 is not negligible for
# p = 0.1). `prob` is spelled out instead of relying on partial matching.
ggplot(data.frame(x = 0:20), aes(x)) +
  # p = 0.1
  stat_function(geom = "line", n = 21, fun = dbinom, args = list(size = 20, prob = 0.1), color = "black") +
  stat_function(geom = "point", n = 21, fun = dbinom, args = list(size = 20, prob = 0.1), color = "#6D9929", size = 3) +
  annotate("text", x = 5.5, y = 0.29, label = "n = 20, p = 0.1", color = "#6D9929", size = 6) +
  # p = 0.4
  stat_function(geom = "line", n = 21, fun = dbinom, args = list(size = 20, prob = 0.4), color = "black") +
  stat_function(geom = "point", n = 21, fun = dbinom, args = list(size = 20, prob = 0.4), color = "#4A76E5", size = 3) +
  annotate("text", x = 8, y = 0.2, label = "n = 20, p = 0.4", color = "#4A76E5", size = 6) +
  # p = 0.7
  stat_function(geom = "line", n = 21, fun = dbinom, args = list(size = 20, prob = 0.7), color = "black") +
  stat_function(geom = "point", n = 21, fun = dbinom, args = list(size = 20, prob = 0.7), color = "#E67960", size = 3) +
  annotate("text", x = 15, y = 0.21, label = "n = 20, p = 0.7", color = "#E67960", size = 6) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.32)) +
  labs(x = "Number of successes", y = "Mass") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))

15. Poisson distribution

Code

# Poisson probability mass functions for lambda = 1, 4 and 10. Each
# distribution is a black connecting line, coloured points and a label in the
# matching colour.
# The x grid is 0..20 with n = 21 so dpois() is evaluated at every integer
# from 0, where the Poisson support starts (for lambda = 1 the mass at 0
# equals the mass at 1).
ggplot(data.frame(x = 0:20), aes(x)) +
  # lambda = 1
  stat_function(geom = "line", n = 21, fun = dpois, args = list(lambda = 1), color = "black") +
  stat_function(geom = "point", n = 21, fun = dpois, args = list(lambda = 1), color = "coral", size = 4) +
  annotate("text", x = 3, y = 0.37, label = "\U03BB = 1", color = "coral", size = 6) +
  # lambda = 4
  stat_function(geom = "line", n = 21, fun = dpois, args = list(lambda = 4), color = "black") +
  stat_function(geom = "point", n = 21, fun = dpois, args = list(lambda = 4), color = "darkgreen", size = 4) +
  annotate("text", x = 6, y = 0.2, label = "\U03BB = 4", color = "darkgreen", size = 6) +
  # lambda = 10
  stat_function(geom = "line", n = 21, fun = dpois, args = list(lambda = 10), color = "black") +
  stat_function(geom = "point", n = 21, fun = dpois, args = list(lambda = 10), color = "turquoise", size = 4) +
  annotate("text", x = 14, y = 0.12, label = "\U03BB = 10", color = "turquoise", size = 6) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.42)) +
  labs(x = "Discrete value", y = "Mass")

Citation

BibTeX citation:

@online{bui2024,
  author = {Bui, An},
  title = {Week 1 Figures - {Lectures} 1 and 2},
  date = {2024-04-01},
  url = {https://spring-2024.envs-193ds.com/lecture/lecture_week-01.html},
  langid = {en}
}

For attribution, please cite this work as:

Bui, An. 2024. “Week 1 Figures - Lectures 1 and 2.” April 1, 2024. https://spring-2024.envs-193ds.com/lecture/lecture_week-01.html.

---
title: "Week 1 figures - Lectures 1 and 2"
editor: source
freeze: auto
execute:
  message: false
  warning: false
format:
  html:
    code-fold: true
author:
  - name: An Bui
    url: https://an-bui.com/
    affiliation: UC Santa Barbara, Ecology, Evolution, and Marine Biology
    affiliation-url: https://www.eemb.ucsb.edu/
published-title: "Lecture date"
date: 2024-04-01
date-modified: last-modified
categories: [central tendency, spread, regression, histogram, jitterplot, box-and-whisker plot, probability mass, probability density, probability distribution, normal distribution, z-score calculation, t-distribution, uniform distribution, binomial distribution, poisson distribution]
citation:
  url: https://spring-2024.envs-193ds.com/lecture/lecture_week-01.html
---

## 0. set up

```{r set-up, message = FALSE}
# cleaning
library(tidyverse)
theme_set(theme_classic() +
            theme(panel.grid = element_blank(),
                  axis.text = element_text(size = 18),
                  axis.title = element_text(size = 18),
                  text = element_text(family = "Lato")))

# visualization
library(patchwork)
```

## 1. Math

### sample mean

$$
\bar{y} = \frac{1}{n}\sum_{i = 1}^ny_i
$$

### sample variance

$$
s^2 = \frac{\sum(y_i - \bar{y})^2}{n - 1}
$$

### sample standard deviation

$$
s = \sqrt{\frac{\sum(y_i - \bar{y})^2}{n - 1}}
$$

### coefficient of variation

$$
CV = \frac{\sigma}{\mu}
$$

### z-score for selecting a single individual

$$
z = \frac{y_i - \mu}{\sigma}
$$

## 2. Mean and median

For data following a symmetrical distribution, the mean and median tend to be similar.

```{r mean-med-comparison}
set.seed(1)
rnorm(n = 40, mean = 6, sd = 1) %>% 
  as_tibble() %>% 
  ggplot(aes(x = value)) +
  geom_density() +
  geom_vline(aes(xintercept = mean(value)), color = "blue") +
  annotate("text", x = 5.75, y = 0.5, label = "mean", color = "blue") +
  geom_vline(aes(xintercept = median(value))) +
  annotate("text", x = 6.5, y = 0.5, label = "median") +
  scale_x_continuous(limits = c(2.5, 10)) +
  scale_y_continuous(limits = c(0, 0.5)) +
  labs(x = "Sculpin lengths (cm)") +
  theme(axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.line.y = element_blank())
```

## 3. Range

```{r range-plots}
#| fig-width: 10
#| fig-height: 5

set.seed(1)
narrow <- rnorm(n = 30, mean = 6, sd = 1) %>% 
  as_tibble() %>% 
  mutate(y = 0) %>% 
  ggplot(aes(x = value, y = y)) +
  geom_jitter(shape = 21) +
  geom_point(aes(x = mean(value), y = 0), color = "blue", size = 3) +
  scale_x_continuous(limits = c(0, 15)) +
  scale_y_continuous(limits = c(-0.5, 0.5)) +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank()) +
  labs(x = "Sculpin lengths (cm)")
# min: 3.78
# max: 7.60

set.seed(1)
wide <- rnorm(n = 30, mean = 6, sd = 2) %>% 
  as_tibble() %>% 
  mutate(y = 0) %>% 
  ggplot(aes(x = value, y = y)) +
  geom_jitter(shape = 21) +
  geom_point(aes(x = mean(value), y = 0), color = "blue", size = 3) +
  scale_x_continuous(limits = c(0, 15)) +
  scale_y_continuous(limits = c(-0.5, 0.5)) +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.y = element_blank(),
        axis.title.y = element_blank()) +
  labs(x = "Sculpin lengths (cm)")
# min: 1.57
# max: 9.19

narrow + wide
```

### How would you describe this data?

```{r data-description}
set.seed(1)
ex1 <- rf(n = 100, df1 = 30, df2 = 10)
mean(ex1)
median(ex1)
ex1 %>% 
  enframe() %>% 
  ggplot(aes(x = value)) +
  geom_histogram(bins = 9,
                 color = "#000000",
                 fill = "orange") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Hermit crab shell length (cm)")

set.seed(1)
ex2 <- rnorm(n = 100, mean = 25, sd = 5)
mean(ex2)
median(ex2)
ex2 %>% 
  enframe() %>% 
  ggplot(aes(x = value)) +
  geom_histogram(bins = 9,
                 color = "#000000",
                 fill = "darkgreen") +
  scale_y_continuous(expand = c(0, 0)) +
  labs(x = "Octopus arm length (cm)")
```

## 4. anemone regression example

```{r anemone-regression}
# number of arms 
arms <- seq(from = 40, to = 100, by = 1)

# diameter: anemones can be up to 8 cm long
set.seed(10)
diam <- rnorm(length(arms), mean = seq(from = 1, to = 5, length = length(arms)), sd = 1) 

# create a data frame
df <- cbind(diam, arms) %>% 
  as.data.frame()

ggplot(df, aes(x = arms, y = diam)) +
  geom_point(size = 2) +
  labs(x = "Number of arms", y = "Diameter (cm)")

ggplot(df, aes(x = arms, y = diam)) +
  geom_point(size = 2) +
  # just using geom smooth for the purposes of visualization
  geom_smooth(method = "lm", se = FALSE, linewidth = 2) +
  labs(x = "Number of arms", y = "Diameter (cm)") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

## 5. histogram example

The **Rice rule** guidelines for the calculating the number of bins in a histogram:

$$
bins = 2n^{1/3}
$$

where $n$ is the number of observations. This is an example of a histogram that does follow the rice rule, where the bin number is 8.

```{r histogram}
ggplot(df, aes(x = diam)) +
  scale_x_continuous(breaks = seq(from = 0, to = 8, by = 1)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 19), breaks = seq(from = 0, to = 18, by = 3)) +
  geom_histogram(breaks = seq(from = 0, to = 8, by = 1), color = "#000000", fill = "lightblue") +
  labs(x = "Anemone diameter (cm)", y = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

These histograms do not, and it proves difficult to see the distribution:

```{r hist-too-many-bins}
ggplot(df, aes(x = diam)) +
  scale_x_continuous(breaks = seq(from = 0, to = 8, by = 1)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 19), breaks = seq(from = 0, to = 18, by = 3)) +
  geom_histogram(color = "#000000", fill = "lightblue") +
  labs(x = "Anemone diameter (cm)", y = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))

ggplot(df, aes(x = diam)) +
  scale_x_continuous(breaks = seq(from = 0, to = 8, by = 1)) +
  scale_y_continuous(expand = c(0, 0)) +
  geom_histogram(color = "#000000", fill = "lightblue", bins = 3) +
  labs(x = "Anemone diameter (cm)", y = "Count") +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

## 6. jitter plot and box and whisker plot example

```{r jitterplot}
set.seed(1)

pretend_lengths <- cbind(
  juveniles = rnorm(20, mean = 2, sd = 0.5), 
  females = rnorm(20, mean = 8, sd = 1), 
  males = rnorm(20, mean = 4, sd = 1)
) %>% 
  as_tibble() %>% 
  pivot_longer(cols = 1:3)

ggplot(pretend_lengths, aes(x = name, y = value, color = name)) +
  geom_jitter(width = 0.1, alpha = 0.8, size = 2) +
  scale_color_manual(values = c("darkgreen", "cornflowerblue", "orange")) +
  labs(y = "Weight (g)") +
  theme(axis.title.x = element_blank(),
        legend.position = "none")

ggplot(pretend_lengths, aes(x = name, y = value, color = name, fill = name)) +
  geom_boxplot(alpha = 0.8) +
  scale_color_manual(values = c("darkgreen", "cornflowerblue", "orange")) +
  scale_fill_manual(values = c("darkgreen", "cornflowerblue", "orange")) +
  labs(y = "Weight (g)") +
  theme(axis.title.x = element_blank(),
        legend.position = "none")
```

## 7. Probability mass example

```{r prob-mass}
#| fig-width: 8
#| fig-height: 4

ggplot(data.frame(x = 1:55), aes(x)) +
  stat_function(geom = "bar", n = 55, fun = dpois, args = list(lambda = 10), fill = "coral") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.13)) +
  coord_cartesian(xlim = c(0, 22)) +
  labs(x = "Mussel clump size (count)", y = "Probability mass") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

## 8. Probability density example

```{r prob-density}
#| fig-width: 8
#| fig-height: 4

ggplot(data.frame(x = 1:20), aes(x)) +
  stat_function(geom = "line", n = 100, fun = dnorm, args = list(mean = 10, sd = 2), linewidth = 1) +
  stat_function(geom = "area", fun = dnorm, args = list(mean = 10, sd = 2), xlim = c(12, 14), fill = "turquoise3") +
  geom_vline(xintercept = 12, lty = 2, color = "grey", linewidth = 1) +
  geom_vline(xintercept = 14, lty = 2, color = "grey", linewidth = 1) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.22)) +
  # coord_cartesian(xlim = c(0, 22)) +
  labs(x = "Individual mussel weight (g)", y = "Probability density")
```

## 9. probability distribution

```{r prob-dist}
set.seed(1)
normdist <- rnorm(n = 100000, mean = 0, sd = 1) %>% 
  as_tibble(rownames = "x")

ggplot(normdist) +
  geom_histogram(aes(x = value, after_stat(density)), fill = "white", color = "black", bins = 100) +
  stat_function(fun = dnorm, args = list(mean = 0, sd = 1), color = "blue", linewidth = 2) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.42)) +
  labs(x = "Continuous value", y = "Density")
```

## 10. normal distribution

```{r normal-dist}
ggplot(data.frame(x = -10:25), aes(x)) +
  stat_function(geom = "line", n = 1000, fun = dnorm, args = list(mean = 0, sd = 1), linewidth = 1, color = "darkorange") +
  annotate("text", x = 4.5, y = 0.4, label = "\U03BC = 0, \U03C3 = 1", color = "darkorange", size = 6) +
  stat_function(geom = "line", n = 1000, fun = dnorm, args = list(mean = 15, sd = 3), linewidth = 1, color = "blue") +
  annotate("text", x = 16, y = 0.15, label = "\U03BC = 15, \U03C3 = 3", color = "blue", size = 6) +
  stat_function(geom = "line", n = 1000, fun = dnorm, args = list(mean = 5, sd = 5), linewidth = 1, color = "darkgreen") +
  annotate("text", x = 7, y = 0.1, label = "\U03BC = 5, \U03C3 = 5", color = "darkgreen", size = 6) +
  scale_x_continuous(breaks = seq(-10, 25, 5)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.42)) +
  labs(x = "Continuous value", y = "Density")
```

## 11. z-score calculation

### figure

We'll use $z = -1.23$ for this example.

```{r z-score-figure}
#| fig-width: 10
#| fig-height: 5

# z-score
q <- -1.23

ggplot(data.frame(x = -4:4), aes(x)) +
  # zscore
  geom_linerange(x = q, ymin = 0, ymax = 0.19) +
  # area under the curve
  stat_function(geom = "area", fun = dnorm, args = list(mean = 0, sd = 1), xlim = c(-4, -1.23), fill = "turquoise3") +
  # Z distribution curve
  stat_function(geom = "line", n = 1000, fun = dnorm, args = list(mean = 0, sd = 1), linewidth = 1.5, color = "darkorange") +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.45)) +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title = element_blank(),
        axis.line.y = element_blank())
```

### calculation

```{r z-score calculation}
pnorm(q, mean = 0, sd = 1)
```

You can compare this with the [Z-score table](https://www.ztable.net/).

### chiton example

What is the probability of selecting a chiton that is **less than 6 ft long** given a normally distributed population with $\mu = 12$ g with $\sigma = 3$ g?

```{r chiton-z-example}
# calculate the z-score
chiton_z <- (6 - 12)/3

# calculate the probability under the curve
pnorm(chiton_z, mean = 0, sd = 1)
```

## 12. 68-95-99.7 rule

In a normal distribution, **68%** of values lie within 1 standard deviation of the mean, **95%** within 2 standard deviations, and **99.7%** within 3 standard deviations.

```{r rule-68-95-99.7}
#| fig-width: 16
#| fig-height: 8

labels <- c(
  "", "\U03BC - 3\U03C3", "\U03BC - 2\U03C3", "\U03BC - \U03C3", "\U03BC", "\U03BC + \U03C3", "\U03BC + 2\U03C3", "\U03BC + 3\U03C3", ""
)

ggplot(data.frame(x = -4:4), aes(x)) +
  geom_linerange(x = 1, ymin = 0, ymax = 0.24) +
  geom_linerange(x = -1, ymin = 0, ymax = 0.24) +
  geom_linerange(x = 2, ymin = 0, ymax = 0.055) +
  geom_linerange(x = -2, ymin = 0, ymax = 0.055) +
  geom_linerange(x = 3, ymin = 0, ymax = 0.005) +
  geom_linerange(x = -3, ymin = 0, ymax = 0.005) +
  geom_linerange(x = 0, ymin = 0, ymax = 0.399) +
  stat_function(geom = "line", n = 1000, fun = dnorm, args = list(mean = 0, sd = 1), linewidth = 1.5, color = "darkorange") +
  scale_x_continuous(labels = labels, breaks = seq(-4, 4, by = 1)) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.41)) +
  labs(x = "") +
  theme_classic() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 24),
        axis.line.y = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank())
```

## 13. Student's t distribution

```{r t-dist}
ggplot(data.frame(x = -10:10), aes(x)) +
  stat_function(geom = "line", n = 1000, fun = dt, args = list(df = 1), linewidth = 1, color = "#856F33") +
  annotate("text", x = 3.5, y = 0.3, label = "\U03BD = 1", color = "#856F33", size = 6) +
  stat_function(geom = "line", n = 1000, fun = dt, args = list(df = 3), linewidth = 1, color = "#E6821C") +
  annotate("text", x = 3.5, y = 0.35, label = "\U03BD = 3", color = "#E6821C", size = 6) +
  stat_function(geom = "line", n = 1000, fun = dt, args = list(df = 5), linewidth = 1, color = "#56E9E7") +
  annotate("text", x = 3.5, y = 0.37, label = "\U03BD = 5", color = "#56E9E7", size = 6) +
  stat_function(geom = "line", n = 1000, fun = dt, args = list(df = 100), linewidth = 1, color = "#04B37F") +
  annotate("text", x = 3.5, y = 0.4, label = "\U03BD = 100", color = "#04B37F", size = 6) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.42)) +
  labs(x = "Continuous value", y = "Density") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

## 14. Uniform distribution

```{r uniform-dist}
ggplot(data.frame(x = 0:10), aes(x)) +
  stat_function(geom = "line", n = 1000, fun = dunif, args = list(min = 2, max = 8), linewidth = 1, color = "firebrick4") +
  annotate("text", x = 2, y = 0.172, label = "a = 2", color = "firebrick4", size = 6) +
  annotate("text", x = 8, y = 0.172, label = "b = 8", color = "firebrick4", size = 6) +
  scale_x_continuous(breaks = seq(0, 10, 2)) +
  scale_y_continuous(expand = c(0, 0), limits = c(-0.001, 0.18)) +
  labs(x = "Continuous value", y = "Density") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

## 14. Binomial distribution

```{r binomial-dist}
ggplot(data.frame(x = 1:20), aes(x)) +
  stat_function(geom = "line", n = 20, fun = dbinom, args = list(size = 20, p = 0.1), color = "black") +
  stat_function(geom = "point", n = 20, fun = dbinom, args = list(size = 20, p = 0.1), color = "#6D9929", size = 3) +
  annotate("text", x = 5.5, y = 0.29, label = "n = 20, p = 0.1", color = "#6D9929", size = 6) +
  stat_function(geom = "line", n = 20, fun = dbinom, args = list(size = 20, p = 0.4), color = "black") +
  stat_function(geom = "point", n = 20, fun = dbinom, args = list(size = 20, p = 0.4), color = "#4A76E5", size = 3) +
  annotate("text", x = 8, y = 0.2, label = "n = 20, p = 0.4", color = "#4A76E5", size = 6) +
  stat_function(geom = "line", n = 20, fun = dbinom, args = list(size = 20, p = 0.7), color = "black") +
  stat_function(geom = "point", n = 20, fun = dbinom, args = list(size = 20, p = 0.7), color = "#E67960", size = 3) +
  annotate("text", x = 15, y = 0.21, label = "n = 20, p = 0.7", color = "#E67960", size = 6) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.32)) +
  labs(x = "Number of successes", y = "Mass") +
  theme_bw() +
  theme(panel.grid = element_blank(),
        axis.text = element_text(size = 18),
        axis.title = element_text(size = 18),
        text = element_text(family = "Lato"))
```

## 15. Poisson distribution

```{r poisson-dist}
ggplot(data.frame(x = 1:20), aes(x)) +
  stat_function(geom = "line", n = 20, fun = dpois, args = list(lambda = 1), color = "black") +
  stat_function(geom = "point", n = 20, fun = dpois, args = list(lambda = 1), color = "coral", size = 4) +
  annotate("text", x = 3, y = 0.37, label = "\U03BB = 1", color = "coral", size = 6) +
  stat_function(geom = "line", n = 20, fun = dpois, args = list(lambda = 4), color = "black") +
  stat_function(geom = "point", n = 20, fun = dpois, args = list(lambda = 4), color = "darkgreen", size = 4) +
  annotate("text", x = 6, y = 0.2, label = "\U03BB = 4", color = "darkgreen", size = 6) +
  stat_function(geom = "line", n = 20, fun = dpois, args = list(lambda = 10), color = "black") +
  stat_function(geom = "point", n = 20, fun = dpois, args = list(lambda = 10), color = "turquoise", size = 4) +
  annotate("text", x = 14, y = 0.12, label = "\U03BB = 10", color = "turquoise", size = 6) +
  scale_y_continuous(expand = c(0, 0), limits = c(0, 0.42)) +
  labs(x = "Discrete value", y = "Mass")
```