From 06ea7df0f7a4610092b89719af953fb4f5960d04 Mon Sep 17 00:00:00 2001
From: Chester Ismay <chester.ismay@gmail.com>
Date: Mon, 21 Oct 2024 15:32:07 -0700
Subject: [PATCH] Add day 3 walkthrough

---
 day3_walkthrough.qmd | 385 ++++++++++++++++++++++++++++---------------
 1 file changed, 254 insertions(+), 131 deletions(-)

diff --git a/day3_walkthrough.qmd b/day3_walkthrough.qmd
index e3e699a..6b51bd0 100644
--- a/day3_walkthrough.qmd
+++ b/day3_walkthrough.qmd
@@ -13,7 +13,7 @@ options(width = 120)
 
 # Day 3: Sampling and Estimation in R
 
-## Session 1: Sampling
+## Session 7: Sampling
 
 ### 1. Load Necessary Packages
 
@@ -25,7 +25,7 @@ library(moderndive)
 library(infer)
 ```
 
-- These packages provide tools for data wrangling, visualization, and modeling.  
+- These packages provide tools for data wrangling, visualization, modeling, and inference.  
 
 ---
 
@@ -59,9 +59,8 @@ store_ball_inventory <- tibble(
 
 ```{r}
 # Use glimpse to explore the structure of the dataset
-glimpse(store_ball_inventory)
-```
 
+```
 
 ---
 
@@ -69,18 +68,16 @@ glimpse(store_ball_inventory)
 
 ```{r}
 # Create a count of ball_type
-store_ball_inventory |> 
-  count(ball_type)
+
 
 # Determine the proportion of pickleballs in the inventory
-p_df <- store_ball_inventory |> 
-  summarize(prop_pickle = mean(ball_type == "Pickleball"))
+
 
 # Convert p to a numeric value
-p <- p_df$prop_pickle
+
 
 # Or using the tidyverse
-p <- p_df |> pull(prop_pickle)
+
 ```
 
 
@@ -90,12 +87,10 @@ p <- p_df |> pull(prop_pickle)
 
 ```{r}
 # Retrieve a sample of 50 balls from the inventory
-ball_sample <- store_ball_inventory |> 
-  slice_sample(n = 50, replace = FALSE)
+
 
 # Determine the proportion of pickleballs in the sample
-ball_sample |> 
-  summarize(prop_pickle = mean(ball_type == "Pickleball"))
+
 ```
 
 ---
@@ -104,12 +99,10 @@ ball_sample |>
 
 ```{r}
 # Retrieve another sample of 50 balls from the inventory
-ball_sample2 <- store_ball_inventory |> 
-  slice_sample(n = 50, replace = FALSE)
+
 
 # Determine the proportion of pickleballs in the sample
-ball_sample2 |> 
-  summarize(prop_pickle = mean(ball_type == "Pickleball"))
+
 ```
 
 
@@ -119,8 +112,7 @@ ball_sample2 |>
 
 ```{r}
 # Use `rep_slice_sample()` from the `infer` package
-ball_samples <- store_ball_inventory |> 
-  rep_slice_sample(n = 50, reps = 1000, replace = FALSE)
+
 ```
 
 
@@ -130,13 +122,10 @@ ball_samples <- store_ball_inventory |>
 
 ```{r}
 # Determine sample proportions with `dplyr`
-props_pickle <- ball_samples |> 
-  summarize(prop_pickle = mean(ball_type == "Pickleball"))
+
 
 # Create a histogram of the sample proportions
-ggplot(props_pickle, aes(x = prop_pickle)) +
-  geom_histogram(bins = 15, color = "white") +
-  labs(x = "Sample proportion", title = "Histogram of 1000 sample proportions of Pickleballs") 
+
 
 ```
 
@@ -148,195 +137,329 @@ ggplot(props_pickle, aes(x = prop_pickle)) +
 ```{r}
 # Using the simulations, calculate the standard deviation of the 
 # sample proportions
-se_sample_props <- props_pickle |> 
-  summarize(sd(prop_pickle)) |> 
-  pull()
+
 
 # Using the formula for the standard error of a sample proportion
-n <- 50
-se_sample_props_formula <- sqrt(p * (1 - p) / n)
+
 ```
 
 
 ---
 
-<!-- Need to be written -->
+### 10. Repeat for Different Sample Sizes
 
-### Session 1 Review Questions
+```{r}
+# Create a function to calculate the standard error of sample proportions
+# using simulation
 
-**(1.1)** What is the purpose of the `na.omit()` function in the code below?
 
-```r
-penguins_data <- penguins |>
-select(species, island, flipper_length_mm, body_mass_g) |>
-na.omit()
+# Standard errors for different sample sizes
+
 ```
 
-A. It replaces missing values with the median.  
-B. It removes any rows that contain missing values.  
-C. It converts missing values to zeros.  
-D. It fills missing values with the previous non-missing value.
+### Session 7 Review Questions
+
+**`(7.1)`** What is the purpose of using the `slice_sample()` function in the code provided?
+
+A. To randomly select a subset of the population without replacement.  
+B. To calculate the mean value of a numeric variable.  
+C. To remove missing values from the dataset.  
+D. To calculate the population parameters.
 
 ---
 
-**(1.2)** What does the `tidy_summary()` function do when applied to numeric columns in the `penguins_data` data frame?
+**`(7.2)`** In the context of the sporting goods store example, what does the sample proportion of pickleballs represent?
 
-A. It generates summary statistics for each column in the data frame.  
-B. It prints the first 6 rows of the data frame.  
-C. It provides a compact overview of the data, including column names and data types.  
-D. It creates a scatterplot of the variables in the data frame.
+A. The actual number of pickleballs in the entire inventory.  
+B. The proportion of pickleballs in a random sample taken from the inventory.  
+C. The total number of all types of sport balls in the inventory.  
+D. The probability of not selecting a pickleball from the population.
 
 ---
 
-**(1.3)** Which of the following correctly calculates the mean body mass in the `penguins_data` data frame?
+**`(7.3)`** What does the function `rep_slice_sample()` do in the sampling process?
 
-A. `summarize |> penguins_data(mean_flipper = mean(body_mass_g))`  
-B. `summarize(mean(body_mass_g))`  
-C. `summarize(penguins_data, mean = body_mass_g)`
-D. `penguins_data |> summarize(mean_body_mass = mean(body_mass_g))`  
+A. It generates multiple samples from a population, each with the same size.  
+B. It creates a histogram of sample proportions.  
+C. It removes duplicate samples from the dataset.  
+D. It replicates the population to simulate different proportions.
 
 ---
 
-**(1.4)** What does the `get_correlation()` function do when applied in the code below?
+**`(7.4)`** Why is the standard error calculated when taking samples from a population?
 
-```r
-penguins_data |>
-get_correlation(formula = flipper_length_mm ~ body_mass_g)
-```
+A. To ensure that the sample is randomly selected.  
+B. To estimate the total number of items in the population.  
+C. To adjust the sample size for better accuracy.  
+D. To measure how much sample proportions vary from the population proportion.  
 
-A. It fits a linear regression model.  
-B. It gives a measure of the linear relationship between flipper length and body mass.
-C. It plots a scatterplot of flipper length and body mass.  
-D. It generates summary statistics for flipper length and body mass.
+---
+
+**`(7.5)`** How does increasing the sample size affect the standard error of the sample proportions?
+
+A. It increases the standard error because more data points create more variation.  
+B. It decreases the standard error, leading to more precise estimates of the population proportion.  
+C. It has no effect on the standard error.  
+D. It changes the population proportion directly.
 
 ---
 
-**(1.5)** In the following code, what is the purpose of `geom_smooth(method = "lm", se = FALSE)`?
+## Session 8: Estimation using Theory-Based Methods
+
+### 11. Population Data with Numeric Variable of Interest
+
+```{r}
+# Simulated population: a tibble of 9500 adults (person_ID 1 to 9500), each
+# with a commute time in minutes drawn via rnorm(mean = 30, sd = 10)
+# NOTE(review): draws differ on every run unless a seed is set earlier -- TODO confirm
+commute_data <- tibble(
+  person_ID = 1:9500,
+  commute_time = rnorm(n = 9500, mean = 30, sd = 10)
+)
+```
+
+### 12. The Sample and the Sample Statistic
+
+```{r}
+# Choose sample size
+
+
+# Generate a sample
+
+
+# Calculate the sample mean
+
+
+# Calculate the standard deviation 
+
+```
+
+
+### 13. Population Parameter
+
+```{r}
+# Calculate the population mean
+
+
+# Calculate the population standard deviation
+
+```
+
+### 14. Confidence Interval for the Population Mean (Assuming We Know $\sigma$)
+
+```{r}
+# Calculate the margin of error
+
+
+# Recall the point estimate
+
+
+# Calculate the confidence interval
+
+
+# Display the confidence interval
+
+
+# Remember the population parameter (we usually don't know it)
+
+```
+
+### 15. Confidence Interval for the Population Mean (Assuming We Don't Know $\sigma$)
+
+```{r}
+# Calculate the margin of error
+
+
+# Same point estimate
+
+
+# Calculate the confidence interval
+
+
+# Display the confidence interval
+
+
+# Remember the population parameter (we usually don't know it)
 
-```r
-ggplot(penguins_data, aes(x = flipper_length_mm, y = body_mass_g)) +
-geom_point(alpha = 0.5) +
-geom_smooth(method = "lm", se = FALSE) +
-labs(x = "Flipper Length (mm)", y = "Body Mass (g)",
-title = "Flipper Length vs. Body Mass with Regression Line")
 ```
 
-A. It adds a smoothed curve based on a polynomial fit to the scatterplot.  
-B. It adds a linear regression line to the scatterplot without displaying the confidence interval.  
-C. It adjusts the transparency of the points in the scatterplot.  
-D. It calculates and displays residuals on the plot.
+### 16. Interpreting the Confidence Interval
+
 
 ---
 
-## Session 2: Estimation using Theory-Based Methods
+### Session 8 Review Questions
+
+**`(8.1)`** What does the sample mean represent in general?
 
+A. The mean for the entire population.  
+B. The mean of a smaller collection drawn from the larger group of interest.  
+C. The mean for those outside the sample.  
+D. The population parameter.
 
 ---
 
-### Session 2 Review Questions
+**`(8.2)`** Which of the following describes the purpose of calculating a margin of error?
 
+A. To estimate the standard deviation of the sample.  
+B. To account for the variability in sample means and create a confidence interval.  
+C. To find the population mean directly.  
+D. To calculate the proportion of people with a commute time under the sample mean.
 
-**(2.1)** What is the default baseline island in the simple linear regression model for predicting `body_mass_g` using `island` as a categorical variable?
+---
+
+**`(8.3)`** How is the margin of error calculated when the population standard deviation is known?
 
-A. Torgersen  
-B. Biscoe  
-C. Dream  
-D. The island with the highest body mass
+A. Using a t-distribution and the sample standard deviation.  
+B. Using a z-distribution and the sample standard deviation.  
+C. Using a z-distribution and the population standard deviation.  
+D. Using a t-distribution and the population standard deviation.
 
 ---
 
-**(2.2)** In the following regression model with interaction terms, what does the coefficient for `flipper_length_mm:speciesGentoo` represent?
+**`(8.4)`** When using the $t$-distribution for confidence intervals, why is it used instead of the $z$-distribution?
+
+A. The t-distribution adjusts for larger sample sizes.  
+B. The t-distribution is used when the sample standard deviation is smaller than the population standard deviation.  
+C. The t-distribution accounts for the population mean being known.  
+D. The t-distribution is used when the population standard deviation is unknown.  
+
+---
+
+**`(8.5)`** What does it mean to be "95% confident" in the confidence interval calculated?
+
+A. That 95% of similarly constructed confidence intervals from repeated samples would contain the true population mean.  
+B. That the population mean is exactly equal to the sample mean.  
+C. That 95% of the data points in the sample fall within the interval.  
+D. That 95% of the sample means from different samples will be the same as the sample mean in this confidence interval.
+
+---
+
+## Session 9: Estimation Using Bootstrapping Methods
+
+### 17. Assume We Only Have A Sample
+
+```{r}
+# Assume we only have a sample of 100 adults and their commute times
 
-```r
-multi_model_interaction <- lm(body_mass_g ~ flipper_length_mm * species, data = penguins_data)
 ```
 
-A. The slope of the regression line for Gentoo penguins  
-B. The average body mass of Gentoo penguins  
-C. The change in the relationship between flipper length and body mass for Gentoo penguins compared to the baseline  
-D. The difference in intercept for Gentoo penguins compared to the baseline  
+---
+
+### 18. Going Over the `infer` Framework
 
+![infer_framework](https://moderndive.com/v2/images/flowcharts/infer/visualize.png)
 ---
 
-**(2.3)** What is the purpose of adding interaction terms to a multiple regression model?
+### 19. Bootstrapping the Sample
 
-A. To make the model more complex without any real benefit  
-B. To estimate the relationship between each explanatory variable and the response variable independently  
-C. To account for how the relationship between one explanatory variable and the response depends on another explanatory variable  
-D. To automatically improve the model's R-squared value
+```{r}
+# Bootstrapping the sample
+
+```
 
 ---
 
-**(2.4)** What does the following plot indicate about the relationship between flipper length and body mass for different islands?
+### 20. Determine the Mean of the Bootstrap Sample
 
 ```{r}
-ggplot(penguins_data, aes(x = flipper_length_mm, y = body_mass_g, color = island)) +
-  geom_point() +
-  geom_smooth(method = "lm", se = FALSE) +
-  labs(x = "Flipper Length (mm)", y = "Body Mass (g)", color = "Island")
+# Calculate the mean of the bootstrap sample
+
 ```
 
-A. The relationship between flipper length and body mass is the same for all islands 
-B. The relationship between flipper length and body mass differs across species, with each island having a different slope  
-C. The islands do not influence the relationship between flipper length and body mass  
-D. Only the intercept differs for each island, but the slope is the same
+---
+
+### 21. Bootstrapping 1000 Samples
+
+```{r}
+# Bootstrapping 1000 samples
+
+```
+
+### 22. Get the Mean of Each Bootstrap Sample
+
+```{r}
+# Calculate the mean of each bootstrap sample
+
+```
+
+---
+
+### 23. Visualizing the Bootstrap Distribution
+
+```{r}
+# Create a histogram of the bootstrap means
+
+```
 
 ---
 
-**(2.5)** In the multiple regression model with interaction terms, how are dummy variables used for categorical predictors?
+### 24. Calculate the Bootstrap Confidence Interval
+
+```{r}
+# Calculate the bootstrap confidence interval in two ways, since the bootstrap distribution is bell-shaped
 
-A. Dummy variables are used to represent the different categories of a categorical variable 
-B. Dummy variables are used to represent each numerical variable  
-C. Dummy variables are used to replace the intercept in the regression model  
-D. Dummy variables are not necessary for categorical variables
+```
 
 ---
 
-## Session 3: Multiple Linear Regression Analysis (Part 2)
+### 25. Interpretation of the Bootstrap Confidence Interval
+
 
 ---
 
-### Session 3 Review Questions
+### 26. Visualize Confidence Interval on Top of Bootstrap Distribution
 
-**`(3.1)`** What is the main assumption in a multiple regression model without interaction terms?
+```{r}
+# Show the histogram of bootstrap means with the confidence interval
+# and the population parameter (not usually known)
 
-A. The relationship between the explanatory variables and the response is always quadratic.  
-B. The slope of the regression line is different for each level of the categorical variable.  
-C. The slope of the regression line is the same for all levels of the categorical variable.  
-D. The intercept of the regression line is the same for all levels of the categorical variable.
+```
 
 ---
 
-**`(3.2)`** In the model `lm(body_mass_g ~ flipper_length_mm + species, data = penguins_data)`, what does the coefficient for `speciesGentoo` represent?
+### Session 9 Review Questions
+
+**`(9.1)`** What is the purpose of bootstrapping?
 
-A. The effect of being a Gentoo penguin on body mass compared to the baseline species (Adélie).  
-B. The effect of flipper length on body mass for Gentoo penguins only.  
-C. The change in flipper length due to species.  
-D. The effect of body mass on flipper length for Gentoo penguins.
+A. To calculate the exact population mean from a sample.  
+B. To generate multiple samples from the population without replacement.  
+C. To estimate the sampling distribution of the sample mean by resampling the original sample.  
+D. To directly calculate the confidence interval without using the sample data.
 
 ---
 
-**`(3.3)`** What does the function `geom_parallel_slopes()` do in a regression plot?
+**`(9.2)`** In the bootstrapping process, what does the `generate(reps = 1000, type = "bootstrap")` function do?
 
-A. It fits regression lines with different slopes for each group.  
-B. It plots regression lines with the same slope but different intercepts for each group.  
-C. It creates a scatterplot without any regression lines.  
-D. It visualizes interaction effects between the variables.
+A. It creates 1000 random samples from the original population.  
+B. It creates 1000 random samples with replacement from the original sample.  
+C. It creates 1000 exact copies of the population.  
+D. It creates 1000 different statistics from the original population.
 
 ---
 
-**`(3.4)`** In the model `lm(body_mass_g ~ flipper_length_mm + bill_length_mm, data = penguins_data)`, what do the coefficients for `flipper_length_mm` and `bill_length_mm` represent?
+**`(9.3)`** What does the histogram of bootstrap sample means represent?
 
-A. The total body mass of each penguin species.  
-B. The predicted body mass for penguins with average bill and flipper lengths.
-C. The interaction effect between flipper length and bill length.  
-D. The effect of bill length on body mass, controlling for flipper length, and vice versa. 
+A. The distribution of sample means from the 1000 bootstrap samples.  
+B. The distribution of values in the original population.  
+C. The distribution of the population means calculated from the original sample.  
+D. The actual population mean with 95% certainty.
 
 ---
 
-**`(3.5)`** What does a regression model without interaction terms assume about the relationship between the explanatory variables and the response variable?
+**`(9.4)`** How is the bootstrap percentile confidence interval calculated?
+
+A. By calculating the standard deviation of the bootstrap samples.  
+B. By using the $t$-distribution to calculate the margin of error.  
+C. By taking the 2.5th and 97.5th percentiles of the bootstrap sample means.  
+D. By calculating the $z$-distribution based on the sample size.
+
+---
+
+**`(9.5)`** What does it mean to be "95% confident" in the bootstrap confidence interval?
+
+A. That 95% of the bootstrap samples contain the population mean.  
+B. That the true population mean lies within the interval for 95% of all bootstrap samples.  
+C. That 95% of the population values fall within the confidence interval.  
+D. That if we repeated the sampling and bootstrapping process many times, about 95% of the resulting confidence intervals would contain the true population mean.  
 
-A. The explanatory variables are not related to the response.  
-B. The relationship between the explanatory variables and the response is independent of one another.  
-C. The explanatory variables interact to affect the response.  
-D. The response variable depends only on the categorical variables.