Difference in means (simulation)

Complete code for class 10/7


Hypothesis Testing


# Permutation null distribution for the difference in mean bill length
# between Gentoo and Chinstrap penguins: 1000 shuffles under the
# independence null, statistic computed in the order Gentoo, Chinstrap.
null_dist <- penguins |>
  filter(species %in% c("Gentoo", "Chinstrap")) |>
  specify(formula = bill_length_mm ~ species) |>
  hypothesize(null = "independence") |>
  generate(reps = 1000, type = "permute") |>
  calculate(
    stat = "diff in means",
    order = c("Gentoo", "Chinstrap")
  )
Dropping unused factor levels Adelie from the supplied explanatory variable
Warning: Removed 1 rows containing missing values.


# Histogram of the simulated null statistics. The red reference lines sit
# at -1.2 (the value passed to get_p_value() below) and its mirror image
# 1.2, marking both tails used by the two-sided test.
null_dist |>
  ggplot(
    aes(x = stat)
  ) +
  geom_histogram() +
  labs(title = "simulated distribution") +
  geom_vline(xintercept = -1.2, color = "red", lwd = 3) +
  geom_vline(xintercept = 1.2, color = "red", lwd = 3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Two-sided p-value for a statistic of -1.2 against the null distribution.
null_dist |>
  get_p_value(obs_stat = -1.2, direction = "two-sided")
# A tibble: 1 × 1
  p_value
    <dbl>
1   0.016

Confidence interval


# Bootstrap distribution of the difference in mean bill length between
# Gentoo and Chinstrap penguins: 1000 bootstrap resamples, statistic
# computed in the order Gentoo, Chinstrap.
boot_df <- penguins |>
  filter(species %in% c("Gentoo", "Chinstrap")) |>
  specify(formula = bill_length_mm ~ species) |>
  generate(reps = 1000, type = "bootstrap") |>
  calculate(
    stat = "diff in means",
    order = c("Gentoo", "Chinstrap")
  )
Dropping unused factor levels Adelie from the supplied explanatory variable
Warning: Removed 1 rows containing missing values.
# Histogram of the bootstrap statistics with red reference lines at
# -2.24 and -0.329.
# NOTE(review): these look like percentile-interval bounds copied from an
# earlier run (the summary below prints slightly different values) --
# confirm, or compute them from boot_df instead of hard-coding.
boot_df |>
  ggplot(
    aes(x = stat)
  ) +
  geom_histogram() +
  geom_vline(xintercept = -2.24, color = "red", lwd = 3) +
  geom_vline(xintercept = -0.329, color = "red", lwd = 3)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 95% percentile interval: the 2.5th and 97.5th percentiles of the
# bootstrap statistics, returned as a one-row tibble (lower, upper).
boot_df |>
  summarize(
    lower = quantile(stat, 0.025),
    upper = quantile(stat, 0.975)
  )
# A tibble: 1 × 2
  lower  upper
  <dbl>  <dbl>
1 -2.25 -0.338