From 06ea7df0f7a4610092b89719af953fb4f5960d04 Mon Sep 17 00:00:00 2001 From: Chester Ismay Date: Mon, 21 Oct 2024 15:32:07 -0700 Subject: [PATCH] Add day 3 walkthrough --- day3_walkthrough.qmd | 385 ++++++++++++++++++++++++++++--------------- 1 file changed, 254 insertions(+), 131 deletions(-) diff --git a/day3_walkthrough.qmd b/day3_walkthrough.qmd index e3e699a..6b51bd0 100644 --- a/day3_walkthrough.qmd +++ b/day3_walkthrough.qmd @@ -13,7 +13,7 @@ options(width = 120) # Day 3: Sampling and Estimation in R -## Session 1: Sampling +## Session 7: Sampling ### 1. Load Necessary Packages @@ -25,7 +25,7 @@ library(moderndive) library(infer) ``` -- These packages provide tools for data wrangling, visualization, and modeling. +- These packages provide tools for data wrangling, visualization, modeling, and inference. --- @@ -59,9 +59,8 @@ store_ball_inventory <- tibble( ```{r} # Use glimpse to explore the structure of the dataset -glimpse(store_ball_inventory) -``` +``` --- @@ -69,18 +68,16 @@ glimpse(store_ball_inventory) ```{r} # Create a count of ball_type -store_ball_inventory |> - count(ball_type) + # Determine the proportion of pickleballs in the inventory -p_df <- store_ball_inventory |> - summarize(prop_pickle = mean(ball_type == "Pickleball")) + # Convert p to a numeric value -p <- p_df$prop_pickle + # Or using the tidyverse -p <- p_df |> pull(prop_pickle) + ``` @@ -90,12 +87,10 @@ p <- p_df |> pull(prop_pickle) ```{r} # Retrieve a sample of 50 balls from the inventory -ball_sample <- store_ball_inventory |> - slice_sample(n = 50, replace = FALSE) + # Determine the proportion of pickleballs in the sample -ball_sample |> - summarize(prop_pickle = mean(ball_type == "Pickleball")) + ``` --- @@ -104,12 +99,10 @@ ball_sample |> ```{r} # Retrieve another sample of 50 balls from the inventory -ball_sample2 <- store_ball_inventory |> - slice_sample(n = 50, replace = FALSE) + # Determine the proportion of pickleballs in the sample -ball_sample2 |> - summarize(prop_pickle 
= mean(ball_type == "Pickleball")) + ``` @@ -119,8 +112,7 @@ ball_sample2 |> ```{r} # Use `rep_slice_sample()` from the `infer` package -ball_samples <- store_ball_inventory |> - rep_slice_sample(n = 50, reps = 1000, replace = FALSE) + ``` @@ -130,13 +122,10 @@ ball_samples <- store_ball_inventory |> ```{r} # Determine sample proportions with `dplyr` -props_pickle <- ball_samples |> - summarize(prop_pickle = mean(ball_type == "Pickleball")) + # Create a histogram of the sample proportions -ggplot(props_pickle, aes(x = prop_pickle)) + - geom_histogram(bins = 15, color = "white") + - labs(x = "Sample proportion", title = "Histogram of 1000 sample proportions of Pickleballs") + ``` @@ -148,195 +137,329 @@ ggplot(props_pickle, aes(x = prop_pickle)) + ```{r} # Using the simulations, calculate the standard deviation of the # sample proportions -se_sample_props <- props_pickle |> - summarize(sd(prop_pickle)) |> - pull() + # Using the formula for the standard error of a sample proportion -n <- 50 -se_sample_props_formula <- sqrt(p * (1 - p) / n) + ``` --- - +### 10. Repeat for Different Sample Sizes -### Session 1 Review Questions +```{r} +# Create a function to calculate the standard error of sample proportions +# using simulation -**(1.1)** What is the purpose of the `na.omit()` function in the code below? -```r -penguins_data <- penguins |> -select(species, island, flipper_length_mm, body_mass_g) |> -na.omit() +# Standard errors for different sample sizes + ``` -A. It replaces missing values with the median. -B. It removes any rows that contain missing values. -C. It converts missing values to zeros. -D. It fills missing values with the previous non-missing value. +### Session 7 Review Questions + +**`(7.1)`** What is the purpose of using the `slice_sample()` function in the code provided? + +A. To randomly select a subset of the population without replacement. +B. To calculate the mean value of a numeric variable. +C. To remove missing values from the dataset. +D. 
To calculate the population parameters. --- -**(1.2)** What does the `tidy_summary()` function do when applied to numeric columns in the `penguins_data` data frame? +**`(7.2)`** In the context of the sporting goods store example, what does the sample proportion of pickleballs represent? -A. It generates summary statistics for each column in the data frame. -B. It prints the first 6 rows of the data frame. -C. It provides a compact overview of the data, including column names and data types. -D. It creates a scatterplot of the variables in the data frame. +A. The actual number of pickleballs in the entire inventory. +B. The proportion of pickleballs in a random sample taken from the inventory. +C. The total number of all types of sport balls in the inventory. +D. The probability of not selecting a pickleball from the population. --- -**(1.3)** Which of the following correctly calculates the mean body mass in the `penguins_data` data frame? +**`(7.3)`** What does the function `rep_slice_sample()` do in the sampling process? -A. `summarize |> penguins_data(mean_flipper = mean(body_mass_g))` -B. `summarize(mean(body_mass_g))` -C. `summarize(penguins_data, mean = body_mass_g)` -D. `penguins_data |> summarize(mean_body_mass = mean(body_mass_g))` +A. It generates multiple samples from a population, each with the same size. +B. It creates a histogram of sample proportions. +C. It removes duplicate samples from the dataset. +D. It replicates the population to simulate different proportions. --- -**(1.4)** What does the `get_correlation()` function do when applied in the code below? +**`(7.4)`** Why is the standard error calculated when taking samples from a population? -```r -penguins_data |> -get_correlation(formula = flipper_length_mm ~ body_mass_g) -``` +A. To ensure that the sample is randomly selected. +B. To estimate the total number of items in the population. +C. To adjust the sample size for better accuracy. +D. 
To measure how much sample proportions vary from the population proportion. -A. It fits a linear regression model. -B. It gives a measure of the linear relationship between flipper length and body mass. -C. It plots a scatterplot of flipper length and body mass. -D. It generates summary statistics for flipper length and body mass. +--- + +**`(7.5)`** How does increasing the sample size affect the standard error of the sample proportions? + +A. It increases the standard error because more data points create more variation. +B. It decreases the standard error, leading to more precise estimates of the population proportion. +C. It has no effect on the standard error. +D. It changes the population proportion directly. --- -**(1.5)** In the following code, what is the purpose of `geom_smooth(method = "lm", se = FALSE)`? +## Session 8: Estimation using Theory-Based Methods + +### 11. Population Data with Numeric Variable of Interest + +```{r} +# Create a tibble of 9500 adults and their corresponding commute times +# in minutes +# This acts as a population of adults and their commute times +commute_data <- tibble( + person_ID = 1:9500, + commute_time = rnorm(n = 9500, mean = 30, sd = 10) +) +``` + +### 12. The Sample and the Sample Statistic + +```{r} +# Choose sample size + + +# Generate a sample + + +# Calculate the sample mean + + +# Calculate the standard deviation + +``` + + +### 13. Population Parameter + +```{r} +# Calculate the population mean + + +# Calculate the population standard deviation + +``` + +### 14. Confidence Interval for the Population Mean (Assuming We Know $\sigma$) + +```{r} +# Calculate the margin of error + + +# Recall the point estimate + + +# Calculate the confidence interval + + +# Display the confidence interval + + +# Remember the population parameter (we usually don't know it) + +``` + +### 15. 
Confidence Interval for the Population Mean (Assuming We Don't Know $\sigma$) + +```{r} +# Calculate the margin of error + + +# Same point estimate + + +# Calculate the confidence interval + + +# Display the confidence interval + + +# Remember the population parameter (we usually don't know it) -```r -ggplot(penguins_data, aes(x = flipper_length_mm, y = body_mass_g)) + -geom_point(alpha = 0.5) + -geom_smooth(method = "lm", se = FALSE) + -labs(x = "Flipper Length (mm)", y = "Body Mass (g)", -title = "Flipper Length vs. Body Mass with Regression Line") ``` -A. It adds a smoothed curve based on a polynomial fit to the scatterplot. -B. It adds a linear regression line to the scatterplot without displaying the confidence interval. -C. It adjusts the transparency of the points in the scatterplot. -D. It calculates and displays residuals on the plot. +### 16. Interpreting the Confidence Interval + --- -## Session 2: Estimation using Theory-Based Methods +### Session 8 Review Questions + +**`(8.1)`** What does the sample mean represent in general? +A. The mean for the entire population. +B. The mean for the smaller collection from the larger group of interest. +C. The mean for those outside the sample. +D. The population parameter. --- -### Session 2 Review Questions +**`(8.2)`** Which of the following describes the purpose of calculating a margin of error? +A. To estimate the standard deviation of the sample. +B. To account for the variability in sample means and create a confidence interval. +C. To find the population mean directly. +D. To calculate the proportion of people with a commute time under the sample mean. -**(2.1)** What is the default baseline island in the simple linear regression model for predicting `body_mass_g` using `island` as a categorical variable? +--- + +**`(8.3)`** How is the margin of error calculated when the population standard deviation is known? -A. Torgersen -B. Biscoe -C. Dream -D. The island with the highest body mass +A. 
Using a t-distribution and the sample standard deviation. +B. Using a z-distribution and the sample standard deviation. +C. Using a z-distribution and the population standard deviation. +D. Using a t-distribution and the population standard deviation. --- -**(2.2)** In the following regression model with interaction terms, what does the coefficient for `flipper_length_mm:speciesGentoo` represent? +**`(8.4)`** When using the $t$-distribution for confidence intervals, why is it used instead of the $z$-distribution? + +A. The t-distribution adjusts for larger sample sizes. +B. The t-distribution is used when the sample standard deviation is smaller than the population standard deviation. +C. The t-distribution accounts for the population mean being known. +D. The t-distribution is used when the population standard deviation is unknown. + +--- + +**`(8.5)`** What does it mean to be "95% confident" in the confidence interval calculated? + +A. That 95% of similarly constructed confidence intervals from repeated samples would contain the true population mean. +B. That the population mean is exactly equal to the sample mean. +C. That 95% of the data points in the sample fall within the interval. +D. That 95% of the sample means from different samples will be the same as the sample mean in this confidence interval. + +--- + +## Session 9: Estimation Using Bootstrapping Methods + +### 17. Assume We Only Have A Sample + +```{r} +# Assume we only have a sample of 100 adults and their commute times -```r -multi_model_interaction <- lm(body_mass_g ~ flipper_length_mm * species, data = penguins_data) ``` -A. The slope of the regression line for Gentoo penguins -B. The average body mass of Gentoo penguins -C. The change in the relationship between flipper length and body mass for Gentoo penguins compared to the baseline -D. The difference in intercept for Gentoo penguins compared to the baseline +--- + +### 18. 
Going Over the `infer` Framework +![infer_framework](https://moderndive.com/v2/images/flowcharts/infer/visualize.png) --- -**(2.3)** What is the purpose of adding interaction terms to a multiple regression model? +### 19. Bootstrapping the Sample -A. To make the model more complex without any real benefit -B. To estimate the relationship between each explanatory variable and the response variable independently -C. To account for how the relationship between one explanatory variable and the response depends on another explanatory variable -D. To automatically improve the model's R-squared value +```{r} +# Bootstrapping the sample + +``` --- -**(2.4)** What does the following plot indicate about the relationship between flipper length and body mass for different islands? +### 20. Determine the Mean of the Bootstrap Sample ```{r} -ggplot(penguins_data, aes(x = flipper_length_mm, y = body_mass_g, color = island)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - labs(x = "Flipper Length (mm)", y = "Body Mass (g)", color = "Island") +# Calculate the mean of the bootstrap sample + ``` -A. The relationship between flipper length and body mass is the same for all islands -B. The relationship between flipper length and body mass differs across species, with each island having a different slope -C. The islands do not influence the relationship between flipper length and body mass -D. Only the intercept differs for each island, but the slope is the same +--- + +### 21. Bootstrapping 1000 Samples + +```{r} +# Bootstrapping 1000 samples + +``` + +### 22. Get the Mean of Each Bootstrap Sample + +```{r} +# Calculate the mean of each bootstrap sample + +``` + +--- + +### 23. Visualizing the Bootstrap Distribution + +```{r} +# Create a histogram of the bootstrap means + +``` --- -**(2.5)** In the multiple regression model with interaction terms, how are dummy variables used for categorical predictors? +### 24. 
Calculate the Bootstrap Confidence Interval + +```{r} +# Calculate the bootstrap confidence interval in two ways since bell-shaped -A. Dummy variables are used to represent the different categories of a categorical variable -B. Dummy variables are used to represent each numerical variable -C. Dummy variables are used to replace the intercept in the regression model -D. Dummy variables are not necessary for categorical variables +``` --- -## Session 3: Multiple Linear Regression Analysis (Part 2) +### 25. Interpretation of the Bootstrap Confidence Interval + --- -### Session 3 Review Questions +### 26. Visualize Confidence Interval on Top of Bootstrap Distribution -**`(3.1)`** What is the main assumption in a multiple regression model without interaction terms? +```{r} +# Show the histogram of bootstrap means with the confidence interval +# and the population parameter (not usually known) -A. The relationship between the explanatory variables and the response is always quadratic. -B. The slope of the regression line is different for each level of the categorical variable. -C. The slope of the regression line is the same for all levels of the categorical variable. -D. The intercept of the regression line is the same for all levels of the categorical variable. +``` --- -**`(3.2)`** In the model `lm(body_mass_g ~ flipper_length_mm + species, data = penguins_data)`, what does the coefficient for `speciesGentoo` represent? +### Session 9 Review Questions + +**`(9.1)`** What is the purpose of bootstrapping? -A. The effect of being a Gentoo penguin on body mass compared to the baseline species (Adélie). -B. The effect of flipper length on body mass for Gentoo penguins only. -C. The change in flipper length due to species. -D. The effect of body mass on flipper length for Gentoo penguins. +A. To calculate the exact population mean from a sample. +B. To generate multiple samples from the population without replacement. +C. 
To estimate the sampling distribution of the sample mean by resampling the original sample. +D. To directly calculate the confidence interval without using the sample data. --- -**`(3.3)`** What does the function `geom_parallel_slopes()` do in a regression plot? +**`(9.2)`** In the bootstrapping process, what does the `generate(reps = 1000, type = "bootstrap")` function do? -A. It fits regression lines with different slopes for each group. -B. It plots regression lines with the same slope but different intercepts for each group. -C. It creates a scatterplot without any regression lines. -D. It visualizes interaction effects between the variables. +A. It creates 1000 random samples from the original population. +B. It creates 1000 random samples with replacement from the original sample. +C. It creates 1000 exact copies of the population. +D. It creates 1000 different statistics from the original population. --- -**`(3.4)`** In the model `lm(body_mass_g ~ flipper_length_mm + bill_length_mm, data = penguins_data)`, what do the coefficients for `flipper_length_mm` and `bill_length_mm` represent? +**`(9.3)`** What does the histogram of bootstrap sample means represent? -A. The total body mass of each penguin species. -B. The predicted body mass for penguins with average bill and flipper lengths. -C. The interaction effect between flipper length and bill length. -D. The effect of bill length on body mass, controlling for flipper length, and vice versa. +A. The distribution of sample means from the 1000 bootstrap samples. +B. The distribution of values in the original population. +C. The distribution of the population means calculated from the original sample. +D. The actual population mean with 95% certainty. --- -**`(3.5)`** What does a regression model without interaction terms assume about the relationship between the explanatory variables and the response variable? +**`(9.4)`** How is the bootstrap percentile confidence interval calculated? + +A. 
By calculating the standard deviation of the bootstrap samples. +B. By using the $t$-distribution to calculate the margin of error. +C. By taking the 2.5th and 97.5th percentiles of the bootstrap sample means. +D. By calculating the $z$-distribution based on the sample size. + +--- + +**`(9.5)`** What does it mean to be "95% confident" in the bootstrap confidence interval? + +A. That 95% of the bootstrap samples contain the population mean. +B. That the true population mean lies within the interval for 95% of all bootstrap samples. +C. That 95% of the population values fall within the confidence interval. +D. That if we repeated the bootstrapping process many times, 95% of the confidence intervals would contain the true population mean. -A. The explanatory variables are not related to the response. -B. The relationship between the explanatory variables and the response is independent of one another. -C. The explanatory variables interact to affect the response. -D. The response variable depends only on the categorical variables.