diff --git a/inference_causality/inference_causality.html b/inference_causality/inference_causality.html
deleted file mode 100644
index c06b00a9..00000000
--- a/inference_causality/inference_causality.html
+++ /dev/null
@@ -1,1590 +0,0 @@
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
-
-<meta charset="utf-8">
-<meta name="generator" content="quarto-1.3.450">
-
-<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
-
-
-<title>Causal Inference and Confounding</title>
-<style>
-code{white-space: pre-wrap;}
-span.smallcaps{font-variant: small-caps;}
-div.columns{display: flex; gap: min(4vw, 1.5em);}
-div.column{flex: auto; overflow-x: auto;}
-div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
-ul.task-list{list-style: none;}
-ul.task-list li input[type="checkbox"] {
-  width: 0.8em;
-  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
-  vertical-align: middle;
-}
-/* CSS for syntax highlighting */
-pre > code.sourceCode { white-space: pre; position: relative; }
-pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
-pre > code.sourceCode > span:empty { height: 1.2em; }
-.sourceCode { overflow: visible; }
-code.sourceCode > span { color: inherit; text-decoration: inherit; }
-div.sourceCode { margin: 1em 0; }
-pre.sourceCode { margin: 0; }
-@media screen {
-div.sourceCode { overflow: auto; }
-}
-@media print {
-pre > code.sourceCode { white-space: pre-wrap; }
-pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
-}
-pre.numberSource code
-  { counter-reset: source-line 0; }
-pre.numberSource code > span
-  { position: relative; left: -4em; counter-increment: source-line; }
-pre.numberSource code > span > a:first-child::before
-  { content: counter(source-line);
-    position: relative; left: -1em; text-align: right; vertical-align: baseline;
-    border: none; display: inline-block;
-    -webkit-touch-callout: none; -webkit-user-select: none;
-    -khtml-user-select: none; -moz-user-select: none;
-    -ms-user-select: none; user-select: none;
-    padding: 0 4px; width: 4em;
-  }
-pre.numberSource { margin-left: 3em;  padding-left: 4px; }
-div.sourceCode
-  {   }
-@media screen {
-pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
-}
-</style>
-
-
-<script src="inference_causality_files/libs/clipboard/clipboard.min.js"></script>
-<script src="inference_causality_files/libs/quarto-html/quarto.js"></script>
-<script src="inference_causality_files/libs/quarto-html/popper.min.js"></script>
-<script src="inference_causality_files/libs/quarto-html/tippy.umd.min.js"></script>
-<script src="inference_causality_files/libs/quarto-html/anchor.min.js"></script>
-<link href="inference_causality_files/libs/quarto-html/tippy.css" rel="stylesheet">
-<link href="inference_causality_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
-<script src="inference_causality_files/libs/bootstrap/bootstrap.min.js"></script>
-<link href="inference_causality_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
-<link href="inference_causality_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
-<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" integrity="sha512-c3Nl8+7g4LMSTdrm621y7kf9v3SDPnhxLNhcjFJbKECVnmZHTdo+IRO05sNLTH/D3vA6u1X32ehoLC7WFVdheg==" crossorigin="anonymous"></script>
-<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js" integrity="sha512-bLT0Qm9VnAYZDflyKcBaQ2gg0hSYNQrJ8RilYldYQ1FxQYoCLtUjuuRuZo+fjqhx/qtq/1itJ0C2ejDxltZVFg==" crossorigin="anonymous"></script>
-<script type="application/javascript">define('jquery', [],function() {return window.jQuery;})</script>
-
-  <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
-  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
-
-</head>
-
-<body>
-
-<div id="quarto-content" class="page-columns page-rows-contents page-layout-full">
-<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
-  <nav id="TOC" role="doc-toc" class="toc-active">
-    <h2 id="toc-title">Causal Inference and the Bootstrap</h2>
-   
-  <ul>
-  <li><a href="#parameter-inference-interpreting-regression-coefficients" id="toc-parameter-inference-interpreting-regression-coefficients" class="nav-link active" data-scroll-target="#parameter-inference-interpreting-regression-coefficients">Parameter Inference: Interpreting Regression Coefficients</a></li>
-  <li><a href="#bootstrap-resampling-review" id="toc-bootstrap-resampling-review" class="nav-link" data-scroll-target="#bootstrap-resampling-review">Bootstrap Resampling (Review)</a>
-  <ul class="collapse">
-  <li><a href="#simple-bootstrap-example" id="toc-simple-bootstrap-example" class="nav-link" data-scroll-target="#simple-bootstrap-example">Simple Bootstrap Example</a></li>
-  </ul></li>
-  <li><a href="#collinearity" id="toc-collinearity" class="nav-link" data-scroll-target="#collinearity">Collinearity</a>
-  <ul class="collapse">
-  <li><a href="#hypothesis-testing-through-bootstrap-snowy-plover-demo" id="toc-hypothesis-testing-through-bootstrap-snowy-plover-demo" class="nav-link" data-scroll-target="#hypothesis-testing-through-bootstrap-snowy-plover-demo">Hypothesis Testing through Bootstrap: Snowy Plover Demo</a></li>
-  <li><a href="#a-simpler-model" id="toc-a-simpler-model" class="nav-link" data-scroll-target="#a-simpler-model">A Simpler Model</a></li>
-  <li><a href="#reminder-assumptions-matter" id="toc-reminder-assumptions-matter" class="nav-link" data-scroll-target="#reminder-assumptions-matter">Reminder: Assumptions Matter</a></li>
-  </ul></li>
-  <li><a href="#bonus-content" id="toc-bonus-content" class="nav-link" data-scroll-target="#bonus-content">[Bonus Content]</a>
-  <ul class="collapse">
-  <li><a href="#prediction-vs-causation" id="toc-prediction-vs-causation" class="nav-link" data-scroll-target="#prediction-vs-causation">Prediction vs Causation</a></li>
-  <li><a href="#confounders" id="toc-confounders" class="nav-link" data-scroll-target="#confounders">Confounders</a></li>
-  <li><a href="#how-to-perform-causal-inference" id="toc-how-to-perform-causal-inference" class="nav-link" data-scroll-target="#how-to-perform-causal-inference">How to perform causal inference?</a></li>
-  </ul></li>
-  </ul>
-</nav>
-</div>
-<main class="content column-page-left" id="quarto-document-content">
-
-<header id="title-block-header" class="quarto-title-block default">
-<div class="quarto-title">
-<div class="quarto-title-block"><div><h1 class="title">Causal Inference and Confounding</h1><button type="button" class="btn code-tools-button dropdown-toggle" id="quarto-code-tools-menu" data-bs-toggle="dropdown" aria-expanded="false"><i class="bi"></i> Code</button><ul class="dropdown-menu dropdown-menu-end" aria-labelledby="quarto-code-tools-menu"><li><a id="quarto-show-all-code" class="dropdown-item" href="javascript:void(0)" role="button">Show All Code</a></li><li><a id="quarto-hide-all-code" class="dropdown-item" href="javascript:void(0)" role="button">Hide All Code</a></li><li><hr class="dropdown-divider"></li><li><a id="quarto-view-source" class="dropdown-item" href="javascript:void(0)" role="button">View Source</a></li></ul></div></div>
-</div>
-
-
-
-<div class="quarto-title-meta">
-
-    
-  
-    
-  </div>
-  
-
-</header>
-
-<!-- 
-The **bias** of an estimator is how far off it is from the parameter, on average.
-
-$$\begin{align}\text{Bias}(\hat{\theta}) = \mathbb{E}[\hat{\theta} - \theta] = \mathbb{E}[\hat{\theta}] - \theta\end{align}$$
-
-For example, the bias of the sample mean as an estimator of the population mean is:
-
-$$\begin{align}\mathbb{E}[\bar{X}_n - \mu]
-&= \mathbb{E}[\frac{1}{n}\sum_{i=1}^n (X_i)] - \mu \\
-&= \frac{1}{n}\sum_{i=1}^n \mathbb{E}[X_i] - \mu \\
-&= \frac{1}{n} (n\mu) - \mu \\
-&= 0\end{align}$$
-
-Because its bias is equal to 0, the sample mean is said to be an **unbiased** estimator of the population mean.
-
-The **variance** of an estimator is a measure of how much the estimator tends to vary from its mean value.
-
-$$\begin{align}\text{Var}(\hat{\theta}) = \mathbb{E}\left[(\hat{\theta} - \mathbb{E}[\hat{\theta}])^2 \right]\end{align}$$
-
-The **mean squared error** measures the "goodness" of an estimator by incorporating both the bias and variance. Formally, it is defined as:
-
-$$\begin{align}\text{MSE}(\hat{\theta}) = \mathbb{E}\left[(\hat{\theta} - \theta)^2
-\right]\end{align}$$ -->
-<div class="callout callout-style-default callout-note no-icon callout-titled">
-<div class="callout-header d-flex align-content-center" data-bs-toggle="collapse" data-bs-target=".callout-1-contents" aria-controls="callout-1" aria-expanded="true" aria-label="Toggle callout">
-<div class="callout-icon-container">
-<i class="callout-icon no-icon"></i>
-</div>
-<div class="callout-title-container flex-fill">
-Learning Outcomes
-</div>
-<div class="callout-btn-toggle d-inline-block border-0 py-1 ps-1 pe-0 float-end"><i class="callout-toggle"></i></div>
-</div>
-<div id="callout-1" class="callout-1-contents callout-collapse collapse show">
-<div class="callout-body-container callout-body">
-<ul>
-<li>Construct confidence intervals for hypothesis testing using bootstrapping</li>
-<li>Understand the assumptions we make and their impact on our regression inference</li>
-<li>Explore ways to overcome issues of multicollinearity</li>
-<li>Compare regression correlation and causation</li>
-</ul>
-</div>
-</div>
-</div>
-<p>Last time, we introduced the idea of random variables and how they affect the data and model we construct. We also demonstrated the decomposition of model risk from a fitted model and dived into the bias-variance tradeoff.</p>
-<p>In this lecture, we will explore regression inference via hypothesis testing, understand how to use bootstrapping under the right assumptions, and consider the environment of understanding causality in theory and in practice.</p>
-<section id="parameter-inference-interpreting-regression-coefficients" class="level2">
-<h2 class="anchored" data-anchor-id="parameter-inference-interpreting-regression-coefficients">Parameter Inference: Interpreting Regression Coefficients</h2>
-<p>There are two main reasons why we build models:</p>
-<ul>
-<li><strong>Prediction</strong>: using our model to make accurate predictions on unseen data.</li>
-<li><strong>Inference</strong>: using our model to draw conclusions about the underlying relationship(s) between our features and response. Its goal is to understand complex phenomena occurring in the world we live in. (In machine learning, “inference” can also refer to the <em>process of making predictions</em> with a model, as opposed to training, which is the process of fitting a model; in this lecture, we use the statistical meaning of the word.)</li>
-</ul>
-<p>Recall the framework we established in the last lecture. The true underlying relationship between the data points is given by <span class="math inline">\(Y = g(x) + \epsilon\)</span>, where <span class="math inline">\(g(x)\)</span> is the <em>true underlying relationship</em>, and <span class="math inline">\(\epsilon\)</span> represents randomness. If we assume <span class="math inline">\(g(x)\)</span> is linear, we can express this relationship in terms of the unknown, true model parameters <span class="math inline">\(\theta\)</span>.</p>
-<p><span class="math display">\[f_{\theta}(x) = g(x) + \epsilon = \theta_0 + \theta_1 x_1 + \ldots + \theta_p x_p + \epsilon\]</span></p>
-<p>Our model attempts to estimate each true population parameter <span class="math inline">\(\theta_i\)</span> using the sample estimates <span class="math inline">\(\hat{\theta}_i\)</span> calculated from the design matrix <span class="math inline">\(\Bbb{X}\)</span> and response vector <span class="math inline">\(\Bbb{Y}\)</span>.</p>
-<p><span class="math display">\[f_{\hat{\theta}}(x) = \hat{\theta}_0 + \hat{\theta}_1 x_1 + \ldots + \hat{\theta}_p x_p\]</span></p>
-<p>Let’s pause for a moment. At this point, we’re very used to working with the idea of a model parameter. But what exactly does each coefficient <span class="math inline">\(\theta_i\)</span> actually <em>mean</em>? We can think of each <span class="math inline">\(\theta_i\)</span> as a <em>slope</em> of the linear model – if all other variables are held constant, a unit change in <span class="math inline">\(x_i\)</span> will result in a <span class="math inline">\(\theta_i\)</span> change in <span class="math inline">\(f_{\theta}(x)\)</span>. Broadly speaking, a large value of <span class="math inline">\(\theta_i\)</span> means that the feature <span class="math inline">\(x_i\)</span> has a large effect on the response; conversely, a small value of <span class="math inline">\(\theta_i\)</span> means that <span class="math inline">\(x_i\)</span> has little effect on the response. In the extreme case, if the true parameter <span class="math inline">\(\theta_i\)</span> is 0, then the feature <span class="math inline">\(x_i\)</span> has <strong>no effect</strong> on <span class="math inline">\(Y(x)\)</span>.</p>
-<p>If the true parameter <span class="math inline">\(\theta_i\)</span> for a particular feature is 0, this tells us something pretty significant about the world: there is no underlying relationship between <span class="math inline">\(x_i\)</span> and <span class="math inline">\(Y(x)\)</span>! How then, can we test if a parameter is actually 0? As a baseline, we go through our usual process of drawing a sample, using this data to fit a model, and computing an estimate <span class="math inline">\(\hat{\theta}_i\)</span>. However, we need to also consider the fact that if our random sample had come out differently, we may have found a different result for <span class="math inline">\(\hat{\theta}_i\)</span>. To infer if the true parameter <span class="math inline">\(\theta_i\)</span> is 0, we want to draw our conclusion from the distribution of <span class="math inline">\(\hat{\theta}_i\)</span> estimates we could have drawn across all other random samples. This is where <a href="https://inferentialthinking.com/chapters/11/Testing_Hypotheses.html">hypothesis testing</a> comes in handy!</p>
-<p>To test if the true parameter <span class="math inline">\(\theta_i\)</span> is 0, we construct a <strong>hypothesis test</strong> where our null hypothesis states that the true parameter <span class="math inline">\(\theta_i\)</span> is 0, and the alternative hypothesis states that the true parameter <span class="math inline">\(\theta_i\)</span> is <em>not</em> 0. If our p-value is smaller than our cutoff value (usually p=0.05), we reject the null hypothesis.</p>
-</section>
-<section id="bootstrap-resampling-review" class="level2">
-<h2 class="anchored" data-anchor-id="bootstrap-resampling-review">Bootstrap Resampling (Review)</h2>
-<p>To determine properties of the sampling distribution of an estimator like variance, we’d need to have access to the population so that we can consider all possible samples and compute an estimate for each sample.</p>
-<p align="center">
-<img src="images/population_samples.png" alt="Many random samples drawn from a population, with an estimate computed from each sample" width="650">
-</p>
-<p>However, we don’t have access to the population; we only have <em>one</em> random sample from the population. How can we consider all possible samples if we only have one?</p>
-<p>The idea of bootstrapping is to treat our random sample as a "population" and resample from it <em>with replacement</em>. Intuitively, a random sample resembles the population, so a random <em>resample</em> also resembles a random sample of the population.</p>
-<div class="callout callout-style-default callout-warning no-icon callout-titled">
-<div class="callout-header d-flex align-content-center" data-bs-toggle="collapse" data-bs-target=".callout-2-contents" aria-controls="callout-2" aria-expanded="true" aria-label="Toggle callout">
-<div class="callout-icon-container">
-<i class="callout-icon no-icon"></i>
-</div>
-<div class="callout-title-container flex-fill">
-Why must we resample <em>with replacement</em>?
-</div>
-<div class="callout-btn-toggle d-inline-block border-0 py-1 ps-1 pe-0 float-end"><i class="callout-toggle"></i></div>
-</div>
-<div id="callout-2" class="callout-2-contents callout-collapse collapse show">
-<div class="callout-body-container callout-body">
-<p>Given an original sample of size <span class="math inline">\(n\)</span>, we want a resample that has the same size <span class="math inline">\(n\)</span> as the original. Sampling <em>without</em> replacement will give us the original sample with shuffled rows. Hence, when we calculate summary statistics like the average, our sample <em>without</em> replacement will always have the same average as the original sample, defeating the purpose of a bootstrap.</p>
-</div>
-</div>
-</div>
-<p align="center">
-<img src="images/bootstrap.png" alt="Bootstrap resampling: resamples drawn with replacement from a single random sample" width="700">
-</p>
-<p>Bootstrap resampling is a technique for estimating the sampling distribution of an estimator. To execute it, we can follow the pseudocode below:</p>
-<pre><code>collect a random sample of size n (called the bootstrap population)
-
-initiate list of estimates
-
-repeat 10,000 times:
-    resample with replacement from bootstrap population
-    apply estimator f to the resample
-    store in list
-
-list of estimates is the bootstrapped sampling distribution of f</code></pre>
-<p>How well does bootstrapping actually represent our population? The bootstrapped sampling distribution of an estimator does not exactly match the sampling distribution of that estimator, but it is often close. Similarly, the variance of the bootstrapped distribution is often close to the true variance of the estimator. The example below displays the results of different bootstraps from a <em>known</em> population using a sample size of <span class="math inline">\(n=50\)</span>.</p>
-<p align="center">
-<img src="images/bootstrapped_samples.png" alt="Bootstrapped sampling distributions from several different samples of a known population" width="600">
-</p>
-<p>In the real world, we don’t know the population distribution. The center of the bootstrapped distribution is the estimator applied to our original sample, so we have no way of recovering the estimator’s true expected value; the center and spread of our bootstrap are <em>approximations</em>. The quality of our bootstrapped distribution also depends on the quality of our original sample; if our original sample was not representative of the population (like Sample 5 in the image above), then the bootstrap is next to useless. In general, bootstrapping works better for <em>large samples</em>, when the population distribution is <em>not heavily skewed</em> (no outliers), and when the estimator is <em>“low variance”</em> (insensitive to extreme values).</p>
-<section id="simple-bootstrap-example" class="level3">
-<h3 class="anchored" data-anchor-id="simple-bootstrap-example">Simple Bootstrap Example</h3>
-<p>TODO</p>
-<!-- #### PurpleAir (chose to skip this section because it's too complex for the amount of pedagogical value it adds)
-To show an example of this hypothesis testing process, we'll work with air quality measurement data. There are 2 common sources of air quality information: Air Quality System (AQS) and [PurpleAir sensors](https://www2.purpleair.com/). AQS is seen as the gold standard because it is high quality, well-calibrated, and publicly available. However, it is very expensive, and the sensors are far apart; reports are also delayed due to extensive calibration.  
-
-On the other hand, PurpleAir (PA) sensors are much cheaper, easier to install, and has denser coverage (measurements are taken every 2 minutes). Unfortunately, its measurements are much less accurate than AQS. 
-
-For this demo, our goal is to use AQS sensor measurements to improve PurpleAir measurements by training a model that adjusts PA measurements based on AQS measurements
-
-$$PA \approx \theta_0 + \theta_1 AQS$$
-
-Using this approximation, we'll invert the model to predict the true air quality from PA measurements
-$$ \text{True Air Quality } \approx -\frac{\theta_0}{\theta_1} + \frac{1}{\theta_1} PA$$
-
-::: {.callout-tip collapse="false"}
-### Inverse Model Derivation 
-Intuitively, AQS measurements are very accurate, so we can treat AQS as the true air quality: 
-$AQS = \text{True Air Quality}$
-
-$$
-\begin{align}
-PA &\approx \theta_0 + \theta_1 AQS \\
-&\approx \theta_0 + \theta_1 \text{True Air Quality} \\
-PA - \theta_0 &\approx + \theta_1 \text{True Air Quality} \\
-\frac{PA - \theta_0}{\theta_1} &\approx \text{True Air Quality} \\
-\text{True Air Quality } &\approx -\frac{\theta_0}{\theta_1} + \frac{1}{\theta_1} PA 
-\end{align}
-$$
-:::
-
-import numpy as np
-import pandas as pd
-import matplotlib
-import matplotlib.pyplot as plt
-import seaborn as sns
-import sklearn.linear_model as lm
-from sklearn.linear_model import LinearRegression
-
-# big font helper
-def adjust_fontsize(size=None):
-    SMALL_SIZE = 8
-    MEDIUM_SIZE = 10
-    BIGGER_SIZE = 12
-    if size != None:
-        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size
-
-    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
-    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
-    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
-    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
-    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
-    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
-    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
-
-plt.style.use('fivethirtyeight')
-sns.set_context("talk")
-sns.set_theme()
-#plt.style.use('default') # revert style to default mpl
-adjust_fontsize(size=20)
-%matplotlib inline
-csv_file = 'data/Full24hrdataset.csv'
-usecols = ['Date', 'ID', 'region', 'PM25FM', 'PM25cf1', 'TempC', 'RH', 'Dewpoint']
-full_df = (pd.read_csv(csv_file, usecols=usecols, parse_dates=['Date'])
-        .dropna())
-full_df.columns = ['date', 'id', 'region', 'pm25aqs', 'pm25pa', 'temp', 'rh', 'dew']
-full_df = full_df.loc[(full_df['pm25aqs'] < 50)]
-
-
-bad_dates = ['2019-08-21', '2019-08-22', '2019-09-24']
-GA = full_df.loc[(full_df['id'] == 'GA1') & (~full_df['date'].isin(bad_dates)) , :]
-AQS, PA = GA[['pm25aqs']], GA['pm25pa']
-AQS.head()
-pd.DataFrame(PA).head()
-``` -->
-</section>
-</section>
-<section id="collinearity" class="level2">
-<h2 class="anchored" data-anchor-id="collinearity">Collinearity</h2>
-<section id="hypothesis-testing-through-bootstrap-snowy-plover-demo" class="level3">
-<h3 class="anchored" data-anchor-id="hypothesis-testing-through-bootstrap-snowy-plover-demo">Hypothesis Testing through Bootstrap: Snowy Plover Demo</h3>
-<p>An equivalent way to execute the hypothesis test described earlier is through <strong>bootstrapping</strong> (this equivalence can be proven through the <a href="https://stats.stackexchange.com/questions/179902/confidence-interval-p-value-duality-vs-frequentist-interpretation-of-cis">duality argument</a>, which is out of scope for this class). We use bootstrapping to compute approximate 95% confidence intervals for each <span class="math inline">\(\theta_i\)</span>. If the interval doesn’t contain 0, we reject the null hypothesis at the p=5% level. Otherwise, the data is consistent with the null, as the true parameter <em>could possibly</em> be 0.</p>
-<p>To show an example of this hypothesis testing process, we’ll work with the <a href="https://www.audubon.org/field-guide/bird/snowy-plover">snowy plover</a> dataset throughout this section. The data are about the eggs and newly-hatched chicks of the Snowy Plover. The data were collected at the Point Reyes National Seashore by a former <a href="https://openlibrary.org/books/OL2038693M/BLSS_the_Berkeley_interactive_statistical_system">student at Berkeley</a>. Here’s a <a href="http://cescos.fau.edu/jay/eps/articles/snowyplover.html">parent bird and some eggs</a>.</p>
-<p align="center">
-<img src="images/plover_eggs.jpg" alt="A snowy plover parent bird and some eggs" width="550">
-</p>
-<p>Note that <code>Egg Length</code> and <code>Egg Breadth</code> (widest diameter) are measured in millimeters, and <code>Egg Weight</code> and <code>Bird Weight</code> are measured in grams; for comparison, a standard paper clip weighs about one gram.</p>
-<div class="cell" data-execution_count="1">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
-<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a>eggs <span class="op">=</span> pd.read_csv(<span class="st">"data/snowy_plover.csv"</span>)</span>
-<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a>eggs.head(<span class="dv">5</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display" data-execution_count="1">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">egg_weight</th>
-<th data-quarto-table-cell-role="th">egg_length</th>
-<th data-quarto-table-cell-role="th">egg_breadth</th>
-<th data-quarto-table-cell-role="th">bird_weight</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">0</td>
-<td>7.4</td>
-<td>28.80</td>
-<td>21.84</td>
-<td>5.2</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">1</td>
-<td>7.7</td>
-<td>29.04</td>
-<td>22.45</td>
-<td>5.4</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">2</td>
-<td>7.9</td>
-<td>29.36</td>
-<td>22.48</td>
-<td>5.6</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">3</td>
-<td>7.5</td>
-<td>30.10</td>
-<td>21.71</td>
-<td>5.3</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">4</td>
-<td>8.3</td>
-<td>30.17</td>
-<td>22.75</td>
-<td>5.9</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-</div>
-<p>Our goal will be to predict the weight of a newborn plover chick, which we assume follows the true relationship <span class="math inline">\(Y = f_{\theta}(x)\)</span> below.</p>
-<p><span class="math display">\[\text{bird\_weight} = \theta_0 + \theta_1 \text{egg\_weight} + \theta_2 \text{egg\_length} + \theta_3 \text{egg\_breadth} + \epsilon\]</span></p>
-<ul>
-<li>For each <span class="math inline">\(i\)</span>, the parameter <span class="math inline">\(\theta_i\)</span> is a fixed number, but it is unobservable. We can only estimate it.</li>
-<li>The random error <span class="math inline">\(\epsilon\)</span> is also unobservable, but it is assumed to have expectation 0 and be independent and identically distributed across eggs.</li>
-</ul>
-<p>Say we wish to determine if the <code>egg_weight</code> impacts the <code>bird_weight</code> of a chick – we want to infer if <span class="math inline">\(\theta_1\)</span> is equal to 0.</p>
-<p>First, we define our hypotheses:</p>
-<ul>
-<li><strong>Null hypothesis</strong>: the true parameter <span class="math inline">\(\theta_1\)</span> is 0; any variation is due to random chance.</li>
-<li><strong>Alternative hypothesis</strong>: the true parameter <span class="math inline">\(\theta_1\)</span> is not 0.</li>
-</ul>
-<p>Next, we use our data to fit a model <span class="math inline">\(\hat{Y} = f_{\hat{\theta}}(x)\)</span> that approximates the relationship above. This gives us the <strong>observed value</strong> of <span class="math inline">\(\hat{\theta}_1\)</span> found from our data.</p>
-<div class="cell" data-execution_count="2">
-<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LinearRegression</span>
-<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
-<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> eggs[[<span class="st">"egg_weight"</span>, <span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>]]</span>
-<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a>Y <span class="op">=</span> eggs[<span class="st">"bird_weight"</span>]</span>
-<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a>model <span class="op">=</span> LinearRegression()</span>
-<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a>model.fit(X, Y)</span>
-<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a><span class="co"># This gives an array containing the fitted model parameter estimates</span></span>
-<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a>thetas <span class="op">=</span> model.coef_</span>
-<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Put the parameter estimates in a nice table for viewing</span></span>
-<span id="cb3-14"><a href="#cb3-14" aria-hidden="true" tabindex="-1"></a>display(pd.DataFrame(</span>
-<span id="cb3-15"><a href="#cb3-15" aria-hidden="true" tabindex="-1"></a>  [model.intercept_] <span class="op">+</span> <span class="bu">list</span>(model.coef_),</span>
-<span id="cb3-16"><a href="#cb3-16" aria-hidden="true" tabindex="-1"></a>  columns<span class="op">=</span>[<span class="st">'theta_hat'</span>],</span>
-<span id="cb3-17"><a href="#cb3-17" aria-hidden="true" tabindex="-1"></a>  index<span class="op">=</span>[<span class="st">'intercept'</span>, <span class="st">'egg_weight'</span>, <span class="st">'egg_length'</span>, <span class="st">'egg_breadth'</span>]</span>
-<span id="cb3-18"><a href="#cb3-18" aria-hidden="true" tabindex="-1"></a>))</span>
-<span id="cb3-19"><a href="#cb3-19" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb3-20"><a href="#cb3-20" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"MSE"</span>, np.mean((Y <span class="op">-</span> model.predict(X)) <span class="op">**</span> <span class="dv">2</span>))</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-<div class="cell-output cell-output-display">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">theta_hat</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">intercept</td>
-<td>-4.605670</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">egg_weight</td>
-<td>0.431229</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">egg_length</td>
-<td>0.066570</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">egg_breadth</td>
-<td>0.215914</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-<div class="cell-output cell-output-stdout">
-<pre><code>MSE 0.04547085380275766</code></pre>
-</div>
-</div>
-<p>Our single sample of data gives us the value of <span class="math inline">\(\hat{\theta}_1=0.431\)</span>. To get a sense of how this estimate might vary if we were to draw different random samples, we will use <a href="https://inferentialthinking.com/chapters/13/2/Bootstrap.html?">bootstrapping</a>. To construct a bootstrap sample, we will draw a resample from the collected data that:</p>
-<ul>
-<li>Has the same sample size as the collected data</li>
-<li>Is drawn with replacement (this ensures that we don’t draw the exact same sample every time!)</li>
-</ul>
-<p>We draw a bootstrap sample, use this sample to fit a model, and record the result for <span class="math inline">\(\hat{\theta}_1\)</span> on this bootstrapped sample. We then repeat this process many times to generate a <strong>bootstrapped empirical distribution</strong> of <span class="math inline">\(\hat{\theta}_1\)</span>. This gives us an estimate of what the true distribution of <span class="math inline">\(\hat{\theta}_1\)</span> across all possible samples might look like.</p>
-<div class="cell" data-execution_count="3">
-<div class="sourceCode cell-code" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Set a random seed so you generate the same random sample as staff</span></span>
-<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="co"># In the "real world", we wouldn't do this</span></span>
-<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
-<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a>np.random.seed(<span class="dv">1337</span>)</span>
-<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the sample size of each bootstrap sample</span></span>
-<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a>n <span class="op">=</span> <span class="bu">len</span>(eggs)</span>
-<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a list to store all the bootstrapped estimates</span></span>
-<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a>estimates <span class="op">=</span> []</span>
-<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Generate a bootstrap resample from `eggs` and find an estimate for theta_1 using this sample. </span></span>
-<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Repeat 10000 times.</span></span>
-<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10000</span>):</span>
-<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a>    <span class="co"># draw a bootstrap sample</span></span>
-<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a>    bootstrap_resample <span class="op">=</span> eggs.sample(n, replace<span class="op">=</span><span class="va">True</span>)</span>
-<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a>    X_bootstrap <span class="op">=</span> bootstrap_resample[[<span class="st">"egg_weight"</span>, <span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>]]</span>
-<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a>    Y_bootstrap <span class="op">=</span> bootstrap_resample[<span class="st">"bird_weight"</span>]</span>
-<span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a>    <span class="co"># use bootstrapped sample to fit a model</span></span>
-<span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a>    bootstrap_model <span class="op">=</span> LinearRegression()</span>
-<span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a>    bootstrap_model.fit(X_bootstrap, Y_bootstrap)</span>
-<span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a>    bootstrap_thetas <span class="op">=</span> bootstrap_model.coef_</span>
-<span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb5-25"><a href="#cb5-25" aria-hidden="true" tabindex="-1"></a>    <span class="co"># record the result for theta_1</span></span>
-<span id="cb5-26"><a href="#cb5-26" aria-hidden="true" tabindex="-1"></a>    estimates.append(bootstrap_thetas[<span class="dv">0</span>])</span>
-<span id="cb5-27"><a href="#cb5-27" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb5-28"><a href="#cb5-28" aria-hidden="true" tabindex="-1"></a><span class="co"># calculate the 95% confidence interval </span></span>
-<span id="cb5-29"><a href="#cb5-29" aria-hidden="true" tabindex="-1"></a>lower <span class="op">=</span> np.percentile(estimates, <span class="fl">2.5</span>, axis<span class="op">=</span><span class="dv">0</span>)</span>
-<span id="cb5-30"><a href="#cb5-30" aria-hidden="true" tabindex="-1"></a>upper <span class="op">=</span> np.percentile(estimates, <span class="fl">97.5</span>, axis<span class="op">=</span><span class="dv">0</span>)</span>
-<span id="cb5-31"><a href="#cb5-31" aria-hidden="true" tabindex="-1"></a>conf_interval <span class="op">=</span> (lower, upper)</span>
-<span id="cb5-32"><a href="#cb5-32" aria-hidden="true" tabindex="-1"></a>conf_interval</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-<div class="cell-output cell-output-display" data-execution_count="3">
-<pre><code>(-0.258648119568487, 1.103424385420405)</code></pre>
-</div>
-</div>
-<p>Our bootstrapped 95% confidence interval for <span class="math inline">\(\theta_1\)</span> is <span class="math inline">\([-0.259, 1.103]\)</span>. Immediately, we can see that 0 <em>is</em> indeed contained in this interval – this means that we <em>cannot</em> conclude that <span class="math inline">\(\theta_1\)</span> is non-zero! More formally, we fail to reject the null hypothesis (that <span class="math inline">\(\theta_1\)</span> is 0) under a 5% p-value cutoff.</p>
-<p>We can repeat this process to construct 95% confidence intervals for the other parameters of the model.</p>
-<div class="cell" data-execution_count="4">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>np.random.seed(<span class="dv">1337</span>)</span>
-<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>theta_0_estimates <span class="op">=</span> []</span>
-<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a>theta_1_estimates <span class="op">=</span> []</span>
-<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a>theta_2_estimates <span class="op">=</span> []</span>
-<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a>theta_3_estimates <span class="op">=</span> []</span>
-<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10000</span>):</span>
-<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a>    bootstrap_resample <span class="op">=</span> eggs.sample(n, replace<span class="op">=</span><span class="va">True</span>)</span>
-<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a>    X_bootstrap <span class="op">=</span> bootstrap_resample[[<span class="st">"egg_weight"</span>, <span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>]]</span>
-<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a>    Y_bootstrap <span class="op">=</span> bootstrap_resample[<span class="st">"bird_weight"</span>]</span>
-<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb7-14"><a href="#cb7-14" aria-hidden="true" tabindex="-1"></a>    bootstrap_model <span class="op">=</span> LinearRegression()</span>
-<span id="cb7-15"><a href="#cb7-15" aria-hidden="true" tabindex="-1"></a>    bootstrap_model.fit(X_bootstrap, Y_bootstrap)</span>
-<span id="cb7-16"><a href="#cb7-16" aria-hidden="true" tabindex="-1"></a>    bootstrap_theta_0 <span class="op">=</span> bootstrap_model.intercept_</span>
-<span id="cb7-17"><a href="#cb7-17" aria-hidden="true" tabindex="-1"></a>    bootstrap_theta_1, bootstrap_theta_2, bootstrap_theta_3 <span class="op">=</span> bootstrap_model.coef_</span>
-<span id="cb7-18"><a href="#cb7-18" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb7-19"><a href="#cb7-19" aria-hidden="true" tabindex="-1"></a>    theta_0_estimates.append(bootstrap_theta_0)</span>
-<span id="cb7-20"><a href="#cb7-20" aria-hidden="true" tabindex="-1"></a>    theta_1_estimates.append(bootstrap_theta_1)</span>
-<span id="cb7-21"><a href="#cb7-21" aria-hidden="true" tabindex="-1"></a>    theta_2_estimates.append(bootstrap_theta_2)</span>
-<span id="cb7-22"><a href="#cb7-22" aria-hidden="true" tabindex="-1"></a>    theta_3_estimates.append(bootstrap_theta_3)</span>
-<span id="cb7-23"><a href="#cb7-23" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb7-24"><a href="#cb7-24" aria-hidden="true" tabindex="-1"></a>theta_0_lower, theta_0_upper <span class="op">=</span> np.percentile(theta_0_estimates, <span class="fl">2.5</span>), np.percentile(theta_0_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb7-25"><a href="#cb7-25" aria-hidden="true" tabindex="-1"></a>theta_1_lower, theta_1_upper <span class="op">=</span> np.percentile(theta_1_estimates, <span class="fl">2.5</span>), np.percentile(theta_1_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb7-26"><a href="#cb7-26" aria-hidden="true" tabindex="-1"></a>theta_2_lower, theta_2_upper <span class="op">=</span> np.percentile(theta_2_estimates, <span class="fl">2.5</span>), np.percentile(theta_2_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb7-27"><a href="#cb7-27" aria-hidden="true" tabindex="-1"></a>theta_3_lower, theta_3_upper <span class="op">=</span> np.percentile(theta_3_estimates, <span class="fl">2.5</span>), np.percentile(theta_3_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb7-28"><a href="#cb7-28" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb7-29"><a href="#cb7-29" aria-hidden="true" tabindex="-1"></a><span class="co"># Make a nice table to view results</span></span>
-<span id="cb7-30"><a href="#cb7-30" aria-hidden="true" tabindex="-1"></a>pd.DataFrame({<span class="st">"lower"</span>:[theta_0_lower, theta_1_lower, theta_2_lower, theta_3_lower], <span class="st">"upper"</span>:[theta_0_upper, <span class="op">\</span></span>
-<span id="cb7-31"><a href="#cb7-31" aria-hidden="true" tabindex="-1"></a>                theta_1_upper, theta_2_upper, theta_3_upper]}, index<span class="op">=</span>[<span class="st">"theta_0"</span>, <span class="st">"theta_1"</span>, <span class="st">"theta_2"</span>, <span class="st">"theta_3"</span>])</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display" data-execution_count="4">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">lower</th>
-<th data-quarto-table-cell-role="th">upper</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">theta_0</td>
-<td>-15.278542</td>
-<td>5.161473</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">theta_1</td>
-<td>-0.258648</td>
-<td>1.103424</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">theta_2</td>
-<td>-0.099138</td>
-<td>0.208557</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">theta_3</td>
-<td>-0.257141</td>
-<td>0.758155</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-</div>
-<p>Something’s off here. Notice that 0 is included in the 95% confidence interval for <em>every</em> parameter of the model. Using the interpretation we outlined above, this would suggest that we can’t say for certain that <em>any</em> of the input variables impact the response variable! This makes it seem like our model can’t make any predictions – and yet, each model we fit in our bootstrap experiment above could very much make predictions of <span class="math inline">\(Y\)</span>.</p>
-<p>How can we explain this result? Think back to how we first interpreted the parameters of a linear model. We treated each <span class="math inline">\(\theta_i\)</span> as a slope, where a unit increase in <span class="math inline">\(x_i\)</span> leads to a <span class="math inline">\(\theta_i\)</span> increase in <span class="math inline">\(Y\)</span>, <strong>if all other variables are held constant</strong>. It turns out that this last assumption is very important. If variables in our model are somehow related to one another, then it might not be possible to have a change in one of them while holding the others constant. This means that our interpretation framework is no longer valid! In the models we fit above, we incorporated <code>egg_length</code>, <code>egg_breadth</code>, and <code>egg_weight</code> as input variables. These variables are very likely related to one another – an egg with large <code>egg_length</code> and <code>egg_breadth</code> will likely be heavy in <code>egg_weight</code>. This means that the model parameters cannot be meaningfully interpreted as slopes.</p>
-<p>To support this conclusion, we can visualize the relationships between our feature variables. Notice the strong positive association between the features.</p>
-<div class="cell" data-execution_count="5">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns</span>
-<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a>sns.pairplot(eggs[[<span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>, <span class="st">"egg_weight"</span>, <span class="st">'bird_weight'</span>]])<span class="op">;</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display">
-<p><img src="inference_causality_files/figure-html/cell-6-output-1.png" alt="Pair plot of egg_length, egg_breadth, egg_weight, and bird_weight" width="946" height="945"></p>
-</div>
-</div>
-<p>This issue is known as <strong>collinearity</strong>, sometimes also called <strong>multicollinearity</strong>. Collinearity occurs when one feature can be predicted fairly accurately by a linear combination of the other features, which happens when one feature is highly correlated with the others.</p>
-<p>Why is collinearity a problem? Its consequences span several aspects of the modeling process:</p>
-<ul>
-<li><strong>Inference</strong>: Slopes can’t be interpreted for an inference task.</li>
-<li><strong>Model Variance</strong>: If features strongly influence one another, even small changes in the sampled data can lead to large changes in the estimated slopes.</li>
-<li><strong>Unique Solution</strong>: If one feature is a linear combination of the other features, the design matrix will not be full rank, and <span class="math inline">\(\mathbb{X}^{\top}\mathbb{X}\)</span> is not invertible. This means that least squares does not have a unique solution. See <a href="https://ds100.org/course-notes/ols/ols.html#bonus-uniqueness-of-the-solution">this section</a> of Course Note 12 for more on this.</li>
-</ul>
-<p>The take-home point is that we need to be careful with what features we select for modeling. If two features likely encode similar information, it is often a good idea to choose only one of them as an input variable.</p>
-</section>
-<section id="a-simpler-model" class="level3">
-<h3 class="anchored" data-anchor-id="a-simpler-model">A Simpler Model</h3>
-<p>Let us now consider a more interpretable model: we instead assume a true relationship using only egg weight:</p>
-<p><span class="math display">\[f_\theta(x) = \theta_0 + \theta_1 \text{egg\_weight} + \epsilon\]</span></p>
-<div class="cell" data-execution_count="6">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LinearRegression</span>
-<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a>X_int <span class="op">=</span> eggs[[<span class="st">"egg_weight"</span>]]</span>
-<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a>Y_int <span class="op">=</span> eggs[<span class="st">"bird_weight"</span>]</span>
-<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a>model_int <span class="op">=</span> LinearRegression()</span>
-<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a>model_int.fit(X_int, Y_int)</span>
-<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a><span class="co"># This gives an array containing the fitted model parameter estimates</span></span>
-<span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a>thetas_int <span class="op">=</span> model_int.coef_</span>
-<span id="cb9-11"><a href="#cb9-11" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb9-12"><a href="#cb9-12" aria-hidden="true" tabindex="-1"></a><span class="co"># Put the parameter estimates in a nice table for viewing</span></span>
-<span id="cb9-13"><a href="#cb9-13" aria-hidden="true" tabindex="-1"></a>pd.DataFrame({<span class="st">"theta_hat"</span>:[model_int.intercept_, thetas_int[<span class="dv">0</span>]]}, index<span class="op">=</span>[<span class="st">"theta_0"</span>, <span class="st">"theta_1"</span>])</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display" data-execution_count="6">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">theta_hat</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">theta_0</td>
-<td>-0.058272</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">theta_1</td>
-<td>0.718515</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-</div>
-<div class="cell" data-execution_count="7">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb10"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
-<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Set a random seed so you generate the same random sample as staff</span></span>
-<span id="cb10-4"><a href="#cb10-4" aria-hidden="true" tabindex="-1"></a><span class="co"># In the "real world", we wouldn't do this</span></span>
-<span id="cb10-5"><a href="#cb10-5" aria-hidden="true" tabindex="-1"></a>np.random.seed(<span class="dv">1337</span>)</span>
-<span id="cb10-6"><a href="#cb10-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-7"><a href="#cb10-7" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the sample size of each bootstrap sample</span></span>
-<span id="cb10-8"><a href="#cb10-8" aria-hidden="true" tabindex="-1"></a>n <span class="op">=</span> <span class="bu">len</span>(eggs)</span>
-<span id="cb10-9"><a href="#cb10-9" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-10"><a href="#cb10-10" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a list to store all the bootstrapped estimates</span></span>
-<span id="cb10-11"><a href="#cb10-11" aria-hidden="true" tabindex="-1"></a>estimates_int <span class="op">=</span> []</span>
-<span id="cb10-12"><a href="#cb10-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-13"><a href="#cb10-13" aria-hidden="true" tabindex="-1"></a><span class="co"># Generate a bootstrap resample from `eggs` and find an estimate for theta_1 using this sample. </span></span>
-<span id="cb10-14"><a href="#cb10-14" aria-hidden="true" tabindex="-1"></a><span class="co"># Repeat 10000 times.</span></span>
-<span id="cb10-15"><a href="#cb10-15" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10000</span>):</span>
-<span id="cb10-16"><a href="#cb10-16" aria-hidden="true" tabindex="-1"></a>    bootstrap_resample_int <span class="op">=</span> eggs.sample(n, replace<span class="op">=</span><span class="va">True</span>)</span>
-<span id="cb10-17"><a href="#cb10-17" aria-hidden="true" tabindex="-1"></a>    X_bootstrap_int <span class="op">=</span> bootstrap_resample_int[[<span class="st">"egg_weight"</span>]]</span>
-<span id="cb10-18"><a href="#cb10-18" aria-hidden="true" tabindex="-1"></a>    Y_bootstrap_int <span class="op">=</span> bootstrap_resample_int[<span class="st">"bird_weight"</span>]</span>
-<span id="cb10-19"><a href="#cb10-19" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb10-20"><a href="#cb10-20" aria-hidden="true" tabindex="-1"></a>    bootstrap_model_int <span class="op">=</span> LinearRegression()</span>
-<span id="cb10-21"><a href="#cb10-21" aria-hidden="true" tabindex="-1"></a>    bootstrap_model_int.fit(X_bootstrap_int, Y_bootstrap_int)</span>
-<span id="cb10-22"><a href="#cb10-22" aria-hidden="true" tabindex="-1"></a>    bootstrap_thetas_int <span class="op">=</span> bootstrap_model_int.coef_</span>
-<span id="cb10-23"><a href="#cb10-23" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb10-24"><a href="#cb10-24" aria-hidden="true" tabindex="-1"></a>    estimates_int.append(bootstrap_thetas_int[<span class="dv">0</span>])</span>
-<span id="cb10-25"><a href="#cb10-25" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb10-26"><a href="#cb10-26" aria-hidden="true" tabindex="-1"></a>plt.figure(dpi<span class="op">=</span><span class="dv">120</span>)</span>
-<span id="cb10-27"><a href="#cb10-27" aria-hidden="true" tabindex="-1"></a>sns.histplot(estimates_int, stat<span class="op">=</span><span class="st">"density"</span>)</span>
-<span id="cb10-28"><a href="#cb10-28" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="vs">r"$\hat{\theta}_1$"</span>)</span>
-<span id="cb10-29"><a href="#cb10-29" aria-hidden="true" tabindex="-1"></a>plt.title(<span class="vs">r"Bootstrapped estimates $\hat{\theta}_1$ Under the Interpretable Model"</span>)<span class="op">;</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display">
-<p><img src="inference_causality_files/figure-html/cell-8-output-1.png" class="img-fluid" alt="Histogram of the bootstrapped estimates of theta_1 under the interpretable model"></p>
-</div>
-</div>
-<p>Notice how the interpretable model performs almost as well as our other model:</p>
-<div class="cell" data-execution_count="8">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> mean_squared_error</span>
-<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>rmse <span class="op">=</span> mean_squared_error(Y, model.predict(X))</span>
-<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a>rmse_int <span class="op">=</span> mean_squared_error(Y_int, model_int.predict(X_int))</span>
-<span id="cb11-5"><a href="#cb11-5" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f'RMSE of Original Model: </span><span class="sc">{</span>rmse<span class="sc">}</span><span class="ss">'</span>)</span>
-<span id="cb11-6"><a href="#cb11-6" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f'RMSE of Interpretable Model: </span><span class="sc">{</span>rmse_int<span class="sc">}</span><span class="ss">'</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code>RMSE of Original Model: 0.04547085380275766
-RMSE of Interpretable Model: 0.046493941375556846</code></pre>
-</div>
-</div>
-<p>Yet, the confidence interval for the true parameter <span class="math inline">\(\theta_{1}\)</span> does not contain zero.</p>
-<div class="cell" data-execution_count="9">
-<details>
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb13"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a>lower_int <span class="op">=</span> np.percentile(estimates_int, <span class="fl">2.5</span>)</span>
-<span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a>upper_int <span class="op">=</span> np.percentile(estimates_int, <span class="fl">97.5</span>)</span>
-<span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a>conf_interval_int <span class="op">=</span> (lower_int, upper_int)</span>
-<span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a>conf_interval_int</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display" data-execution_count="9">
-<pre><code>(0.6029335250209632, 0.8208401738546206)</code></pre>
-</div>
-</div>
-<p>In retrospect, it’s no surprise that the weight of an egg best predicts the weight of a newly-hatched chick.</p>
-<p>A model with highly correlated variables prevents us from interpreting how the variables are related to the prediction.</p>
-</section>
-<section id="reminder-assumptions-matter" class="level3">
-<h3 class="anchored" data-anchor-id="reminder-assumptions-matter">Reminder: Assumptions Matter</h3>
-<p>Keep the following in mind: All inference assumes that the regression model holds.</p>
-<ul>
-<li>If the model doesn’t hold, the inference might not be valid.</li>
-<li>If the <a href="https://inferentialthinking.com/chapters/13/3/Confidence_Intervals.html?highlight=p%20value%20confidence%20interval#care-in-using-the-bootstrap-percentile-method">assumptions of the bootstrap</a> don’t hold…
-<ul>
-<li>Sample size n is large</li>
-<li>Sample is representative of population distribution (drawn i.i.d., unbiased)</li>
-</ul>
-…then the results of the bootstrap might not be valid.</li>
-</ul>
-</section>
-</section>
-<section id="bonus-content" class="level2">
-<h2 class="anchored" data-anchor-id="bonus-content">[Bonus Content]</h2>
-<p>Note: the content in this section is not in scope.</p>
-<!-- ### Correlation vs. Causation
-Let us consider some questions in an arbitrary regression problem. 
-
-What does $\theta_{j}$ mean in our regression?
-
-* Holding other variables fixed, how much should our prediction change with $X_{j}$?
-
-For simple linear regression, this boils down to the correlation coefficient
-
-* Does having more $x$ predict more $y$ (and by how much)? -->
-<section id="prediction-vs-causation" class="level3">
-<h3 class="anchored" data-anchor-id="prediction-vs-causation">Prediction vs Causation</h3>
-<p>The difference between correlation/prediction vs.&nbsp;causation is best illustrated through examples.</p>
-<p>Some questions about <strong>correlation / prediction</strong> include:</p>
-<ul>
-<li>Are homes with granite countertops worth more money?</li>
-<li>Is college GPA higher for students who win a certain scholarship?</li>
-<li>Are breastfed babies less likely to develop asthma?</li>
-<li>Do cancer patients given some aggressive treatment have a higher 5-year survival rate?</li>
-<li>Are people who smoke more likely to get cancer?</li>
-</ul>
-<p>While these may sound like causal questions, they are not! Questions about <strong>causality</strong> are about the <strong>effects</strong> of <strong>interventions</strong> (not just passive observation). For example:</p>
-<ul>
-<li>How much do granite countertops <strong>raise</strong> the value of a house?</li>
-<li>Does getting the scholarship <strong>improve</strong> students’ GPAs?</li>
-<li>Does breastfeeding <strong>protect</strong> babies against asthma?</li>
-<li>Does the treatment <strong>improve</strong> cancer survival?</li>
-<li>Does smoking <strong>cause</strong> cancer?</li>
-</ul>
-<p>Note, however, that regression coefficients are sometimes called “effects”, which can be deceptive!</p>
-<p>When using data alone, <strong>predictive questions</strong> (e.g.&nbsp;are breastfed babies healthier?) can be answered, but <strong>causal questions</strong> (e.g.&nbsp;does breastfeeding improve babies’ health?) cannot. The reason is that the same observed association can arise in several different ways. For example, possible explanations for why breastfed babies are healthier on average include:</p>
-<ul>
-<li><strong>Causal effect:</strong> breastfeeding makes babies healthier</li>
-<li><strong>Reverse causality:</strong> healthier babies are more likely to breastfeed successfully</li>
-<li><strong>Common cause:</strong> healthier / richer parents have healthier babies and are more likely to breastfeed</li>
-</ul>
-<p>We cannot tell which explanations are true (or to what extent) just by observing (<span class="math inline">\(x\)</span>,<span class="math inline">\(y\)</span>) pairs. Additionally, causal questions implicitly involve <strong>counterfactuals</strong>, events that didn’t happen. For example, we could ask, <strong>would</strong> the <strong>same</strong> breastfed babies have been less healthy <strong>if</strong> they hadn’t been breastfed? The causal-effect explanation above implies they would be, but the reverse-causality and common-cause explanations do not.</p>
-</section>
-<section id="confounders" class="level3">
-<h3 class="anchored" data-anchor-id="confounders">Confounders</h3>
-<p>Let T represent a treatment (for example, alcohol use), and Y represent an outcome (for example, lung cancer).</p>
-<p><img src="images/confounder.png" alt="confounder" width="600"></p>
-<p>A <strong>confounder</strong> is a variable that affects both T and Y, distorting the correlation between them. In the example above, a confounder would be any variable that influences both alcohol use and lung cancer. Confounders can be a measured covariate (a feature) or an unmeasured variable we don’t know about, and they generally cause problems, as the relationship between T and Y is really affected by data we cannot see. We commonly <em>assume that all confounders are observed</em> (this is also called <strong>ignorability</strong>).</p>
-</section>
-<section id="how-to-perform-causal-inference" class="level3">
-<h3 class="anchored" data-anchor-id="how-to-perform-causal-inference">How to perform causal inference?</h3>
-<p>In a <strong>randomized experiment</strong>, participants are randomly assigned into two groups: treatment and control. A treatment is applied <em>only</em> to the treatment group; we assume ignorability and gather as many measurements as possible so that we can compare them between the control and treatment groups to determine whether or not the treatment is really the cause or just a confounding factor.</p>
-<p><img src="images/experiment.png" alt="experiment" width="600"></p>
-<p>However, often, randomly assigning treatments is impractical or unethical. For example, assigning a treatment of cigarettes to test the effect of smoking on lungs would not only be impractical but also unethical.</p>
-<p>An alternative to bypass this issue is to utilize <strong>observational studies</strong>. This can be done by obtaining two participant groups separated based on some identified treatment variable. Unlike randomized experiments, however, we cannot assume ignorability: the participants could have separated into the two groups based on other covariates! In addition, there could also be unmeasured confounders.</p>
-<p><img src="images/observational.png" alt="observational" width="600"></p>
-<!-- -->
-
-</section>
-</section>
-
-</main>
-<!-- /main column -->
-<script id="quarto-html-after-body" type="application/javascript">
-window.document.addEventListener("DOMContentLoaded", function (event) {
-  const toggleBodyColorMode = (bsSheetEl) => {
-    const mode = bsSheetEl.getAttribute("data-mode");
-    const bodyEl = window.document.querySelector("body");
-    if (mode === "dark") {
-      bodyEl.classList.add("quarto-dark");
-      bodyEl.classList.remove("quarto-light");
-    } else {
-      bodyEl.classList.add("quarto-light");
-      bodyEl.classList.remove("quarto-dark");
-    }
-  }
-  const toggleBodyColorPrimary = () => {
-    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
-    if (bsSheetEl) {
-      toggleBodyColorMode(bsSheetEl);
-    }
-  }
-  toggleBodyColorPrimary();  // sync the <body> color-mode class with the bootstrap stylesheet
-  const icon = "";
-  const anchorJS = new window.AnchorJS();
-  anchorJS.options = {
-    placement: 'right',
-    icon: icon
-  };
-  anchorJS.add('.anchored'); // register every element carrying the "anchored" class
-  const isCodeAnnotation = (el) => {
-    for (const clz of el.classList) {
-      if (clz.startsWith('code-annotation-')) {                     
-        return true;
-      }
-    }
-    return false;
-  }
-  const clipboard = new window.ClipboardJS('.code-copy-button', {
-    text: function(trigger) {
-      const codeEl = trigger.previousElementSibling.cloneNode(true);
-      for (const childEl of codeEl.children) {
-        if (isCodeAnnotation(childEl)) {
-          childEl.remove();
-        }
-      }
-      return codeEl.innerText;
-    }
-  });
-  clipboard.on('success', function(e) {
-    // button target
-    const button = e.trigger;
-    // don't keep focus
-    button.blur();
-    // flash "checked"
-    button.classList.add('code-copy-button-checked');
-    var currentTitle = button.getAttribute("title");
-    button.setAttribute("title", "Copied!");
-    let tooltip;
-    if (window.bootstrap) {
-      button.setAttribute("data-bs-toggle", "tooltip");
-      button.setAttribute("data-bs-placement", "left");
-      button.setAttribute("data-bs-title", "Copied!");
-      tooltip = new bootstrap.Tooltip(button, 
-        { trigger: "manual", 
-          customClass: "code-copy-button-tooltip",
-          offset: [0, -8]});
-      tooltip.show();    
-    }
-    setTimeout(function() {
-      if (tooltip) {
-        tooltip.hide();
-        button.removeAttribute("data-bs-title");
-        button.removeAttribute("data-bs-toggle");
-        button.removeAttribute("data-bs-placement");
-      }
-      button.setAttribute("title", currentTitle);
-      button.classList.remove('code-copy-button-checked');
-    }, 1000);
-    // clear code selection
-    e.clearSelection();
-  });
-  const viewSource = window.document.getElementById('quarto-view-source') ||
-                     window.document.getElementById('quarto-code-tools-source');
-  if (viewSource) {
-    const sourceUrl = viewSource.getAttribute("data-quarto-source-url");
-    viewSource.addEventListener("click", function(e) {
-      if (sourceUrl) {
-        // rstudio viewer pane
-        if (/\bcapabilities=\b/.test(window.location)) {
-          window.open(sourceUrl);
-        } else {
-          window.location.href = sourceUrl;
-        }
-      } else {
-        const modal = new bootstrap.Modal(document.getElementById('quarto-embedded-source-code-modal'));
-        modal.show();
-      }
-      return false;
-    });
-  }
-  function toggleCodeHandler(show) {
-    return function(e) {
-      const detailsSrc = window.document.querySelectorAll(".cell > details > .sourceCode");
-      for (let i=0; i<detailsSrc.length; i++) {
-        const details = detailsSrc[i].parentElement;
-        if (show) {
-          details.open = true;
-        } else {
-          details.removeAttribute("open");
-        }
-      }
-      const cellCodeDivs = window.document.querySelectorAll(".cell > .sourceCode");
-      const fromCls = show ? "hidden" : "unhidden";
-      const toCls = show ? "unhidden" : "hidden";
-      for (let i=0; i<cellCodeDivs.length; i++) {
-        const codeDiv = cellCodeDivs[i];
-        if (codeDiv.classList.contains(fromCls)) {
-          codeDiv.classList.remove(fromCls);
-          codeDiv.classList.add(toCls);
-        } 
-      }
-      return false;
-    }
-  }
-  const hideAllCode = window.document.getElementById("quarto-hide-all-code");
-  if (hideAllCode) {
-    hideAllCode.addEventListener("click", toggleCodeHandler(false));
-  }
-  const showAllCode = window.document.getElementById("quarto-show-all-code");
-  if (showAllCode) {
-    showAllCode.addEventListener("click", toggleCodeHandler(true));
-  }
-  function tippyHover(el, contentFn) {
-    const config = {
-      allowHTML: true,
-      content: contentFn,
-      maxWidth: 500,
-      delay: 100,
-      arrow: false,
-      appendTo: function(el) {
-          return el.parentElement;
-      },
-      interactive: true,
-      interactiveBorder: 10,
-      theme: 'quarto',
-      placement: 'bottom-start'
-    };
-    window.tippy(el, config); 
-  }
-  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
-  for (var i=0; i<noterefs.length; i++) {
-    const ref = noterefs[i];
-    tippyHover(ref, function() {
-      // use id or data attribute instead here
-      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
-      try { href = new URL(href).hash; } catch {}
-      const id = href.replace(/^#\/?/, "");
-      const note = window.document.getElementById(id);
-      return note.innerHTML;
-    });
-  }
-      let selectedAnnoteEl;
-      const selectorForAnnotation = ( cell, annotation) => {
-        let cellAttr = 'data-code-cell="' + cell + '"';
-        let lineAttr = 'data-code-annotation="' +  annotation + '"';
-        const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
-        return selector;
-      }
-      const selectCodeLines = (annoteEl) => {
-        const doc = window.document;
-        const targetCell = annoteEl.getAttribute("data-target-cell");
-        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
-        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
-        const lines = annoteSpan.getAttribute("data-code-lines").split(",");
-        const lineIds = lines.map((line) => {
-          return targetCell + "-" + line;
-        })
-        let top = null;
-        let height = null;
-        let parent = null;
-        if (lineIds.length > 0) {
-            //compute the position of the single el (top and bottom and make a div)
-            const el = window.document.getElementById(lineIds[0]);
-            top = el.offsetTop;
-            height = el.offsetHeight;
-            parent = el.parentElement.parentElement;
-          if (lineIds.length > 1) {
-            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
-            const bottom = lastEl.offsetTop + lastEl.offsetHeight;
-            height = bottom - top;
-          }
-          if (top !== null && height !== null && parent !== null) {
-            // cook up a div (if necessary) and position it 
-            let div = window.document.getElementById("code-annotation-line-highlight");
-            if (div === null) {
-              div = window.document.createElement("div");
-              div.setAttribute("id", "code-annotation-line-highlight");
-              div.style.position = 'absolute';
-              parent.appendChild(div);
-            }
-            div.style.top = top - 2 + "px";
-            div.style.height = height + 4 + "px";
-            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
-            if (gutterDiv === null) {
-              gutterDiv = window.document.createElement("div");
-              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
-              gutterDiv.style.position = 'absolute';
-              const codeCell = window.document.getElementById(targetCell);
-              const gutter = codeCell.querySelector('.code-annotation-gutter');
-              gutter.appendChild(gutterDiv);
-            }
-            gutterDiv.style.top = top - 2 + "px";
-            gutterDiv.style.height = height + 4 + "px";
-          }
-          selectedAnnoteEl = annoteEl;
-        }
-      };
-      const unselectCodeLines = () => {
-        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
-        elementsIds.forEach((elId) => {
-          const div = window.document.getElementById(elId);
-          if (div) {
-            div.remove();
-          }
-        });
-        selectedAnnoteEl = undefined;
-      };
-      // Attach click handler to the DT
-      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
-      for (const annoteDlNode of annoteDls) {
-        annoteDlNode.addEventListener('click', (event) => {
-          const clickedEl = event.target;
-          if (clickedEl !== selectedAnnoteEl) {
-            unselectCodeLines();
-            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
-            if (activeEl) {
-              activeEl.classList.remove('code-annotation-active');
-            }
-            selectCodeLines(clickedEl);
-            clickedEl.classList.add('code-annotation-active');
-          } else {
-            // Unselect the line
-            unselectCodeLines();
-            clickedEl.classList.remove('code-annotation-active');
-          }
-        });
-      }
-  const findCites = (el) => {
-    const parentEl = el.parentElement;
-    if (parentEl) {
-      const cites = parentEl.dataset.cites;
-      if (cites) {
-        return {
-          el,
-          cites: cites.split(' ')
-        };
-      } else {
-        return findCites(el.parentElement)
-      }
-    } else {
-      return undefined;
-    }
-  };
-  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
-  for (var i=0; i<bibliorefs.length; i++) {
-    const ref = bibliorefs[i];
-    const citeInfo = findCites(ref);
-    if (citeInfo) {
-      tippyHover(citeInfo.el, function() {
-        var popup = window.document.createElement('div');
-        citeInfo.cites.forEach(function(cite) {
-          var citeDiv = window.document.createElement('div');
-          citeDiv.classList.add('hanging-indent');
-          citeDiv.classList.add('csl-entry');
-          var biblioDiv = window.document.getElementById('ref-' + cite);
-          if (biblioDiv) {
-            citeDiv.innerHTML = biblioDiv.innerHTML;
-          }
-          popup.appendChild(citeDiv);
-        });
-        return popup.innerHTML;
-      });
-    }
-  }
-});
-</script><div class="modal fade" id="quarto-embedded-source-code-modal" tabindex="-1" aria-labelledby="quarto-embedded-source-code-modal-label" aria-hidden="true"><div class="modal-dialog modal-dialog-scrollable"><div class="modal-content"><div class="modal-header"><h5 class="modal-title" id="quarto-embedded-source-code-modal-label">Source Code</h5><button type="button" class="btn-close" data-bs-dismiss="modal" aria-label="Close"></button></div><div class="modal-body"><div class="">
-<div class="sourceCode" id="cb15" data-shortcodes="false"><pre class="sourceCode markdown code-with-copy"><code class="sourceCode markdown"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
-<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="an">title:</span><span class="co"> Causal Inference and Confounding</span></span>
-<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="an">execute:</span></span>
-<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="co">  echo: true</span></span>
-<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="an">format:</span></span>
-<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="co">  html:</span></span>
-<span id="cb15-7"><a href="#cb15-7" aria-hidden="true" tabindex="-1"></a><span class="co">    code-fold: true</span></span>
-<span id="cb15-8"><a href="#cb15-8" aria-hidden="true" tabindex="-1"></a><span class="co">    code-tools: true</span></span>
-<span id="cb15-9"><a href="#cb15-9" aria-hidden="true" tabindex="-1"></a><span class="co">    toc: true</span></span>
-<span id="cb15-10"><a href="#cb15-10" aria-hidden="true" tabindex="-1"></a><span class="co">    toc-title: Causal Inference and the Bootstrap</span></span>
-<span id="cb15-11"><a href="#cb15-11" aria-hidden="true" tabindex="-1"></a><span class="co">    page-layout: full</span></span>
-<span id="cb15-12"><a href="#cb15-12" aria-hidden="true" tabindex="-1"></a><span class="co">    theme:</span></span>
-<span id="cb15-13"><a href="#cb15-13" aria-hidden="true" tabindex="-1"></a><span class="co">      - cosmo</span></span>
-<span id="cb15-14"><a href="#cb15-14" aria-hidden="true" tabindex="-1"></a><span class="co">      - cerulean</span></span>
-<span id="cb15-15"><a href="#cb15-15" aria-hidden="true" tabindex="-1"></a><span class="co">    callout-icon: false</span></span>
-<span id="cb15-16"><a href="#cb15-16" aria-hidden="true" tabindex="-1"></a><span class="an">jupyter:</span><span class="co"> python3</span></span>
-<span id="cb15-17"><a href="#cb15-17" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
-<span id="cb15-18"><a href="#cb15-18" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-19"><a href="#cb15-19" aria-hidden="true" tabindex="-1"></a><span class="co">&lt;!-- </span></span>
-<span id="cb15-20"><a href="#cb15-20" aria-hidden="true" tabindex="-1"></a><span class="co">The **bias** of an estimator is how far off it is from the parameter, on average.</span></span>
-<span id="cb15-21"><a href="#cb15-21" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-22"><a href="#cb15-22" aria-hidden="true" tabindex="-1"></a><span class="co">$$\begin{align}\text{Bias}(\hat{\theta}) = \mathbb{E}[\hat{\theta} - \theta] = \mathbb{E}[\hat{\theta}] - \theta\end{align}$$</span></span>
-<span id="cb15-23"><a href="#cb15-23" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-24"><a href="#cb15-24" aria-hidden="true" tabindex="-1"></a><span class="co">For example, the bias of the sample mean as an estimator of the population mean is:</span></span>
-<span id="cb15-25"><a href="#cb15-25" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-26"><a href="#cb15-26" aria-hidden="true" tabindex="-1"></a><span class="co">$$\begin{align}\mathbb{E}[\bar{X}_n - \mu]</span></span>
-<span id="cb15-27"><a href="#cb15-27" aria-hidden="true" tabindex="-1"></a><span class="co">&amp;= \mathbb{E}[\frac{1}{n}\sum_{i=1}^n (X_i)] - \mu \\</span></span>
-<span id="cb15-28"><a href="#cb15-28" aria-hidden="true" tabindex="-1"></a><span class="co">&amp;= \frac{1}{n}\sum_{i=1}^n \mathbb{E}[X_i] - \mu \\</span></span>
-<span id="cb15-29"><a href="#cb15-29" aria-hidden="true" tabindex="-1"></a><span class="co">&amp;= \frac{1}{n} (n\mu) - \mu \\</span></span>
-<span id="cb15-30"><a href="#cb15-30" aria-hidden="true" tabindex="-1"></a><span class="co">&amp;= 0\end{align}$$</span></span>
-<span id="cb15-31"><a href="#cb15-31" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-32"><a href="#cb15-32" aria-hidden="true" tabindex="-1"></a><span class="co">Because its bias is equal to 0, the sample mean is said to be an **unbiased** estimator of the population mean.</span></span>
-<span id="cb15-33"><a href="#cb15-33" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-34"><a href="#cb15-34" aria-hidden="true" tabindex="-1"></a><span class="co">The **variance** of an estimator is a measure of how much the estimator tends to vary from its mean value.</span></span>
-<span id="cb15-35"><a href="#cb15-35" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-36"><a href="#cb15-36" aria-hidden="true" tabindex="-1"></a><span class="co">$$\begin{align}\text{Var}(\hat{\theta}) = \mathbb{E}\left[(\hat{\theta} - \mathbb{E}[\hat{\theta}])^2 \right]\end{align}$$</span></span>
-<span id="cb15-37"><a href="#cb15-37" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-38"><a href="#cb15-38" aria-hidden="true" tabindex="-1"></a><span class="co">The **mean squared error** measures the "goodness" of an estimator by incorporating both the bias and variance. Formally, it is defined as:</span></span>
-<span id="cb15-39"><a href="#cb15-39" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-40"><a href="#cb15-40" aria-hidden="true" tabindex="-1"></a><span class="co">$$\begin{align}\text{MSE}(\hat{\theta}) = \mathbb{E}\left[(\hat{\theta} - \theta)^2</span></span>
-<span id="cb15-41"><a href="#cb15-41" aria-hidden="true" tabindex="-1"></a><span class="co">\right]\end{align}$$ --&gt;</span></span>
-<span id="cb15-42"><a href="#cb15-42" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-43"><a href="#cb15-43" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-44"><a href="#cb15-44" aria-hidden="true" tabindex="-1"></a>::: {.callout-note collapse="false"}</span>
-<span id="cb15-45"><a href="#cb15-45" aria-hidden="true" tabindex="-1"></a><span class="fu">## Learning Outcomes</span></span>
-<span id="cb15-46"><a href="#cb15-46" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Construct confidence intervals for hypothesis testing using bootstrapping</span>
-<span id="cb15-47"><a href="#cb15-47" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Understand the assumptions we make and their impact on our regression inference</span>
-<span id="cb15-48"><a href="#cb15-48" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Explore ways to overcome issues of multicollinearity</span>
-<span id="cb15-49"><a href="#cb15-49" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Compare regression correlation and causation</span>
-<span id="cb15-50"><a href="#cb15-50" aria-hidden="true" tabindex="-1"></a>:::</span>
-<span id="cb15-51"><a href="#cb15-51" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-52"><a href="#cb15-52" aria-hidden="true" tabindex="-1"></a>Last time, we introduced the idea of random variables and how they affect the data and model we construct.</span>
-<span id="cb15-53"><a href="#cb15-53" aria-hidden="true" tabindex="-1"></a>We also demonstrated the decomposition of model risk from a fitted model and dived into the bias-variance tradeoff.</span>
-<span id="cb15-54"><a href="#cb15-54" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-55"><a href="#cb15-55" aria-hidden="true" tabindex="-1"></a>In this lecture, we will explore regression inference via hypothesis testing, understand how to use bootstrapping under the right assumptions, and consider the environment of understanding causality in theory and in practice.</span>
-<span id="cb15-56"><a href="#cb15-56" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-57"><a href="#cb15-57" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-58"><a href="#cb15-58" aria-hidden="true" tabindex="-1"></a><span class="fu">## Parameter Inference: Interpreting Regression Coefficients</span></span>
-<span id="cb15-59"><a href="#cb15-59" aria-hidden="true" tabindex="-1"></a>There are two main reasons why we build models: </span>
-<span id="cb15-60"><a href="#cb15-60" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-61"><a href="#cb15-61" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Prediction**: using our model to make accurate predictions on unseen data. </span>
-<span id="cb15-62"><a href="#cb15-62" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Inference**: using our model to draw conclusions about the underlying relationship(s) between our features and response. Its goal is to understand complex phenomena occurring in the world we live in. While training is the process of fitting a model, inference is the *process of making predictions*.</span>
-<span id="cb15-63"><a href="#cb15-63" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-64"><a href="#cb15-64" aria-hidden="true" tabindex="-1"></a>Recall the framework we established in the last lecture. The true underlying relationship between the data points is given by $Y = g(x) + \epsilon$, where $g(x)$ is the *true underlying relationship*, and $\epsilon$ represents randomness. If we assume $g(x)$ is linear, we can express this relationship in terms of the unknown, true model parameters $\theta$.</span>
-<span id="cb15-65"><a href="#cb15-65" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-66"><a href="#cb15-66" aria-hidden="true" tabindex="-1"></a>$$f_{\theta}(x) = g(x) + \epsilon = \theta_0 + \theta_1 x_1 + \ldots + \theta_p x_p + \epsilon$$</span>
-<span id="cb15-67"><a href="#cb15-67" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-68"><a href="#cb15-68" aria-hidden="true" tabindex="-1"></a>Our model attempts to estimate each true population parameter $\theta_i$ using the sample estimates $\hat{\theta}_i$ calculated from the design matrix $\Bbb{X}$ and response vector $\Bbb{Y}$.</span>
-<span id="cb15-69"><a href="#cb15-69" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-70"><a href="#cb15-70" aria-hidden="true" tabindex="-1"></a>$$f_{\hat{\theta}}(x) = \hat{\theta}_0 + \hat{\theta}_1 x_1 + \ldots + \hat{\theta}_p x_p$$</span>
-<span id="cb15-71"><a href="#cb15-71" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-72"><a href="#cb15-72" aria-hidden="true" tabindex="-1"></a>Let's pause for a moment. At this point, we're very used to working with the idea of a model parameter. But what exactly does each coefficient $\theta_i$ actually *mean*? We can think of each $\theta_i$ as a *slope* of the linear model – if all other variables are held constant, a unit change in $x_i$ will result in a $\theta_i$ change in $f_{\theta}(x)$. Broadly speaking, a large value of $\theta_i$ means that the feature $x_i$ has a large effect on the response; conversely, a small value of $\theta_i$ means that $x_i$ has little effect on the response. In the extreme case, if the true parameter $\theta_i$ is 0, then the feature $x_i$ has **no effect** on $Y(x)$. </span>
-<span id="cb15-73"><a href="#cb15-73" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-74"><a href="#cb15-74" aria-hidden="true" tabindex="-1"></a>If the true parameter $\theta_i$ for a particular feature is 0, this tells us something pretty significant about the world: there is no underlying relationship between $x_i$ and $Y(x)$! How then, can we test if a parameter is actually 0? As a baseline, we go through our usual process of drawing a sample, using this data to fit a model, and computing an estimate $\hat{\theta}_i$. However, we need to also consider the fact that if our random sample had come out differently, we may have found a different result for $\hat{\theta}_i$. To infer if the true parameter $\theta_i$ is 0, we want to draw our conclusion from the distribution of $\hat{\theta}_i$ estimates we could have drawn across all other random samples. This is where <span class="co">[</span><span class="ot">hypothesis testing</span><span class="co">](https://inferentialthinking.com/chapters/11/Testing_Hypotheses.html)</span> comes in handy! </span>
-<span id="cb15-75"><a href="#cb15-75" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-76"><a href="#cb15-76" aria-hidden="true" tabindex="-1"></a>To test if the true parameter $\theta_i$ is 0, we construct a **hypothesis test** where our null hypothesis states that the true parameter $\theta_i$ is 0, and the alternative hypothesis states that the true parameter $\theta_i$ is *not* 0. If our p-value is smaller than our cutoff value (usually p=0.05), we reject the null hypothesis. </span>
-<span id="cb15-77"><a href="#cb15-77" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-78"><a href="#cb15-78" aria-hidden="true" tabindex="-1"></a><span class="fu">## Bootstrap Resampling (Review)</span></span>
-<span id="cb15-79"><a href="#cb15-79" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-80"><a href="#cb15-80" aria-hidden="true" tabindex="-1"></a>To determine properties of the sampling distribution of an estimator like variance, we’d need to have access to the population so that we can consider all possible samples and compute an estimate for each sample.</span>
-<span id="cb15-81"><a href="#cb15-81" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-82"><a href="#cb15-82" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;p</span> <span class="er">align</span><span class="ot">=</span><span class="st">"center"</span><span class="kw">&gt;</span></span>
-<span id="cb15-83"><a href="#cb15-83" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/population_samples.png"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'y_hat'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'650'</span><span class="kw">&gt;</span></span>
-<span id="cb15-84"><a href="#cb15-84" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;/p&gt;</span></span>
-<span id="cb15-85"><a href="#cb15-85" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-86"><a href="#cb15-86" aria-hidden="true" tabindex="-1"></a>However, we don’t have access to the population; we only have *one* random sample from the population. How can we consider all possible samples if we only have one?</span>
-<span id="cb15-87"><a href="#cb15-87" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-88"><a href="#cb15-88" aria-hidden="true" tabindex="-1"></a>The idea of bootstrapping is to treat our random sample as a \"population\" and resample from it *with replacement*. Intuitively, a random sample resembles the population, so a random *resample* also resembles a random sample of the population.</span>
-<span id="cb15-89"><a href="#cb15-89" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-90"><a href="#cb15-90" aria-hidden="true" tabindex="-1"></a>::: {.callout-warning collapse=\"true\"}</span>
-<span id="cb15-91"><a href="#cb15-91" aria-hidden="true" tabindex="-1"></a><span class="fu">### Why must we resample *with replacement*?</span></span>
-<span id="cb15-92"><a href="#cb15-92" aria-hidden="true" tabindex="-1"></a>Given an original sample of size $n$, we want a resample that has the same size $n$ as the original. Sampling *without* replacement will give us the original sample with shuffled rows. Hence, when we calculate summary statistics like the average, our sample *without* replacement will always have the same average as the original sample, defeating the purpose of a bootstrap.</span>
-<span id="cb15-93"><a href="#cb15-93" aria-hidden="true" tabindex="-1"></a>:::</span>
-<span id="cb15-94"><a href="#cb15-94" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-95"><a href="#cb15-95" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;p</span> <span class="er">align</span><span class="ot">=</span><span class="st">"center"</span><span class="kw">&gt;</span></span>
-<span id="cb15-96"><a href="#cb15-96" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/bootstrap.png"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'y_hat'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'700'</span><span class="kw">&gt;</span></span>
-<span id="cb15-97"><a href="#cb15-97" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;/p&gt;</span></span>
-<span id="cb15-98"><a href="#cb15-98" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-99"><a href="#cb15-99" aria-hidden="true" tabindex="-1"></a>Bootstrap resampling is a technique for estimating the sampling distribution of an estimator. To execute it, we can follow the pseudocode below:</span>
-<span id="cb15-100"><a href="#cb15-100" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-101"><a href="#cb15-101" aria-hidden="true" tabindex="-1"></a><span class="in">collect a random sample of size n (called the bootstrap population)</span></span>
-<span id="cb15-102"><a href="#cb15-102" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-103"><a href="#cb15-103" aria-hidden="true" tabindex="-1"></a><span class="in">initiate list of estimates</span></span>
-<span id="cb15-104"><a href="#cb15-104" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-105"><a href="#cb15-105" aria-hidden="true" tabindex="-1"></a><span class="in">repeat 10,000 times:</span></span>
-<span id="cb15-106"><a href="#cb15-106" aria-hidden="true" tabindex="-1"></a><span class="in">    resample with replacement from bootstrap population</span></span>
-<span id="cb15-107"><a href="#cb15-107" aria-hidden="true" tabindex="-1"></a><span class="in">    apply estimator f to the resample</span></span>
-<span id="cb15-108"><a href="#cb15-108" aria-hidden="true" tabindex="-1"></a><span class="in">    store in list</span></span>
-<span id="cb15-109"><a href="#cb15-109" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-110"><a href="#cb15-110" aria-hidden="true" tabindex="-1"></a><span class="in">list of estimates is the bootstrapped sampling distribution of f</span></span>
-<span id="cb15-111"><a href="#cb15-111" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-112"><a href="#cb15-112" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-113"><a href="#cb15-113" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-114"><a href="#cb15-114" aria-hidden="true" tabindex="-1"></a>How well does bootstrapping actually represent our population? The bootstrapped sampling distribution of an estimator does not exactly match the sampling distribution of that estimator, but it is often close. Similarly, the variance of the bootstrapped distribution is often close to the true variance of the estimator. The example below displays the results of different bootstraps from a *known* population using a sample size of $n=50$.</span>
-<span id="cb15-115"><a href="#cb15-115" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-116"><a href="#cb15-116" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;p</span> <span class="er">align</span><span class="ot">=</span><span class="st">"center"</span><span class="kw">&gt;</span></span>
-<span id="cb15-117"><a href="#cb15-117" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/bootstrapped_samples.png"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'y_hat'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'600'</span><span class="kw">&gt;</span></span>
-<span id="cb15-118"><a href="#cb15-118" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;/p&gt;</span></span>
-<span id="cb15-119"><a href="#cb15-119" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-120"><a href="#cb15-120" aria-hidden="true" tabindex="-1"></a>In the real world, we don't know the population distribution. The center of the bootstrapped distribution is the estimator applied to our original sample, so we have no way of recovering the estimator's true expected value; the center and spread of our bootstrap are *approximations*. The quality of our bootstrapped distribution also depends on the quality of our original sample; if our original sample was not representative of the population (like Sample 5 in the image above), then the bootstrap is next to useless. In general, bootstrapping works better for *large samples*, when the population distribution is *not heavily skewed* (no outliers), and when the estimator is *“low variance”* (insensitive to extreme values).</span>
-<span id="cb15-121"><a href="#cb15-121" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-122"><a href="#cb15-122" aria-hidden="true" tabindex="-1"></a><span class="fu">### Simple Bootstrap Example</span></span>
-<span id="cb15-123"><a href="#cb15-123" aria-hidden="true" tabindex="-1"></a>TODO</span>
-<span id="cb15-124"><a href="#cb15-124" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-125"><a href="#cb15-125" aria-hidden="true" tabindex="-1"></a><span class="co">&lt;!-- #### PurpleAir (chose to skip this section because it's too complex for the amount of pedagogical value it adds)</span></span>
-<span id="cb15-126"><a href="#cb15-126" aria-hidden="true" tabindex="-1"></a><span class="co">To show an example of this hypothesis testing process, we'll work with air quality measurement data. There are 2 common sources of air quality information: Air Quality System (AQS) and [PurpleAir sensors](https://www2.purpleair.com/). AQS is seen as the gold standard because it is high quality, well-calibrated, and publicly available. However, it is very expensive, and the sensors are far apart; reports are also delayed due to extensive calibration.  </span></span>
-<span id="cb15-127"><a href="#cb15-127" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-128"><a href="#cb15-128" aria-hidden="true" tabindex="-1"></a><span class="co">On the other hand, PurpleAir (PA) sensors are much cheaper, easier to install, and have denser coverage (measurements are taken every 2 minutes). Unfortunately, their measurements are much less accurate than those from AQS. </span></span>
-<span id="cb15-129"><a href="#cb15-129" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-130"><a href="#cb15-130" aria-hidden="true" tabindex="-1"></a><span class="co">For this demo, our goal is to use AQS sensor measurements to improve PurpleAir measurements by training a model that adjusts PA measurements based on AQS measurements.</span></span>
-<span id="cb15-131"><a href="#cb15-131" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-132"><a href="#cb15-132" aria-hidden="true" tabindex="-1"></a><span class="co">$$PA \approx \theta_0 + \theta_1 AQS$$</span></span>
-<span id="cb15-133"><a href="#cb15-133" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-134"><a href="#cb15-134" aria-hidden="true" tabindex="-1"></a><span class="co">Using this approximation, we'll invert the model to predict the true air quality from PA measurements:</span></span>
-<span id="cb15-135"><a href="#cb15-135" aria-hidden="true" tabindex="-1"></a><span class="co">$$ \text{True Air Quality } \approx -\frac{\theta_0}{\theta_1} + \frac{1}{\theta_1} PA$$</span></span>
-<span id="cb15-136"><a href="#cb15-136" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-137"><a href="#cb15-137" aria-hidden="true" tabindex="-1"></a><span class="co">::: {.callout-tip collapse="false"}</span></span>
-<span id="cb15-138"><a href="#cb15-138" aria-hidden="true" tabindex="-1"></a><span class="al">###</span><span class="co"> Inverse Model Derivation </span></span>
-<span id="cb15-139"><a href="#cb15-139" aria-hidden="true" tabindex="-1"></a><span class="co">Intuitively, AQS measurements are very accurate, so we can treat AQS as the true air quality: </span></span>
-<span id="cb15-140"><a href="#cb15-140" aria-hidden="true" tabindex="-1"></a><span class="co">$AQS = \text{True Air Quality}$</span></span>
-<span id="cb15-141"><a href="#cb15-141" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-142"><a href="#cb15-142" aria-hidden="true" tabindex="-1"></a><span class="co">$$</span></span>
-<span id="cb15-143"><a href="#cb15-143" aria-hidden="true" tabindex="-1"></a><span class="co">\begin{align}</span></span>
-<span id="cb15-144"><a href="#cb15-144" aria-hidden="true" tabindex="-1"></a><span class="co">PA &amp;\approx \theta_0 + \theta_1 AQS \\</span></span>
-<span id="cb15-145"><a href="#cb15-145" aria-hidden="true" tabindex="-1"></a><span class="co">&amp;\approx \theta_0 + \theta_1 \text{True Air Quality} \\</span></span>
-<span id="cb15-146"><a href="#cb15-146" aria-hidden="true" tabindex="-1"></a><span class="co">PA - \theta_0 &amp;\approx + \theta_1 \text{True Air Quality} \\</span></span>
-<span id="cb15-147"><a href="#cb15-147" aria-hidden="true" tabindex="-1"></a><span class="co">\frac{PA - \theta_0}{\theta_1} &amp;\approx \text{True Air Quality} \\</span></span>
-<span id="cb15-148"><a href="#cb15-148" aria-hidden="true" tabindex="-1"></a><span class="co">\text{True Air Quality } &amp;\approx -\frac{\theta_0}{\theta_1} + \frac{1}{\theta_1} PA </span></span>
-<span id="cb15-149"><a href="#cb15-149" aria-hidden="true" tabindex="-1"></a><span class="co">\end{align}</span></span>
-<span id="cb15-150"><a href="#cb15-150" aria-hidden="true" tabindex="-1"></a><span class="co">$$</span></span>
-<span id="cb15-151"><a href="#cb15-151" aria-hidden="true" tabindex="-1"></a><span class="co">:::</span></span>
-<span id="cb15-152"><a href="#cb15-152" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-155"><a href="#cb15-155" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-156"><a href="#cb15-156" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: true</span></span>
-<span id="cb15-157"><a href="#cb15-157" aria-hidden="true" tabindex="-1"></a><span class="co">import numpy as np</span></span>
-<span id="cb15-158"><a href="#cb15-158" aria-hidden="true" tabindex="-1"></a><span class="co">import pandas as pd</span></span>
-<span id="cb15-159"><a href="#cb15-159" aria-hidden="true" tabindex="-1"></a><span class="co">import matplotlib</span></span>
-<span id="cb15-160"><a href="#cb15-160" aria-hidden="true" tabindex="-1"></a><span class="co">import matplotlib.pyplot as plt</span></span>
-<span id="cb15-161"><a href="#cb15-161" aria-hidden="true" tabindex="-1"></a><span class="co">import seaborn as sns</span></span>
-<span id="cb15-162"><a href="#cb15-162" aria-hidden="true" tabindex="-1"></a><span class="co">import sklearn.linear_model as lm</span></span>
-<span id="cb15-163"><a href="#cb15-163" aria-hidden="true" tabindex="-1"></a><span class="co">from sklearn.linear_model import LinearRegression</span></span>
-<span id="cb15-164"><a href="#cb15-164" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-165"><a href="#cb15-165" aria-hidden="true" tabindex="-1"></a><span class="co"># big font helper</span></span>
-<span id="cb15-166"><a href="#cb15-166" aria-hidden="true" tabindex="-1"></a><span class="co">def adjust_fontsize(size=None):</span></span>
-<span id="cb15-167"><a href="#cb15-167" aria-hidden="true" tabindex="-1"></a><span class="co">    SMALL_SIZE = 8</span></span>
-<span id="cb15-168"><a href="#cb15-168" aria-hidden="true" tabindex="-1"></a><span class="co">    MEDIUM_SIZE = 10</span></span>
-<span id="cb15-169"><a href="#cb15-169" aria-hidden="true" tabindex="-1"></a><span class="co">    BIGGER_SIZE = 12</span></span>
-<span id="cb15-170"><a href="#cb15-170" aria-hidden="true" tabindex="-1"></a><span class="co">    if size != None:</span></span>
-<span id="cb15-171"><a href="#cb15-171" aria-hidden="true" tabindex="-1"></a><span class="co">        SMALL_SIZE = MEDIUM_SIZE = BIGGER_SIZE = size</span></span>
-<span id="cb15-172"><a href="#cb15-172" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-173"><a href="#cb15-173" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('font', size=SMALL_SIZE)          # controls default text sizes</span></span>
-<span id="cb15-174"><a href="#cb15-174" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title</span></span>
-<span id="cb15-175"><a href="#cb15-175" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels</span></span>
-<span id="cb15-176"><a href="#cb15-176" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels</span></span>
-<span id="cb15-177"><a href="#cb15-177" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels</span></span>
-<span id="cb15-178"><a href="#cb15-178" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize</span></span>
-<span id="cb15-179"><a href="#cb15-179" aria-hidden="true" tabindex="-1"></a><span class="co">    plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title</span></span>
-<span id="cb15-180"><a href="#cb15-180" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-181"><a href="#cb15-181" aria-hidden="true" tabindex="-1"></a><span class="co">plt.style.use('fivethirtyeight')</span></span>
-<span id="cb15-182"><a href="#cb15-182" aria-hidden="true" tabindex="-1"></a><span class="co">sns.set_context("talk")</span></span>
-<span id="cb15-183"><a href="#cb15-183" aria-hidden="true" tabindex="-1"></a><span class="co">sns.set_theme()</span></span>
-<span id="cb15-184"><a href="#cb15-184" aria-hidden="true" tabindex="-1"></a><span class="co">#plt.style.use('default') # revert style to default mpl</span></span>
-<span id="cb15-185"><a href="#cb15-185" aria-hidden="true" tabindex="-1"></a><span class="co">adjust_fontsize(size=20)</span></span>
-<span id="cb15-186"><a href="#cb15-186" aria-hidden="true" tabindex="-1"></a><span class="co">%matplotlib inline</span></span>
-<span id="cb15-187"><a href="#cb15-187" aria-hidden="true" tabindex="-1"></a><span class="co">csv_file = 'data/Full24hrdataset.csv'</span></span>
-<span id="cb15-188"><a href="#cb15-188" aria-hidden="true" tabindex="-1"></a><span class="co">usecols = ['Date', 'ID', 'region', 'PM25FM', 'PM25cf1', 'TempC', 'RH', 'Dewpoint']</span></span>
-<span id="cb15-189"><a href="#cb15-189" aria-hidden="true" tabindex="-1"></a><span class="co">full_df = (pd.read_csv(csv_file, usecols=usecols, parse_dates=['Date'])</span></span>
-<span id="cb15-190"><a href="#cb15-190" aria-hidden="true" tabindex="-1"></a><span class="co">        .dropna())</span></span>
-<span id="cb15-191"><a href="#cb15-191" aria-hidden="true" tabindex="-1"></a><span class="co">full_df.columns = ['date', 'id', 'region', 'pm25aqs', 'pm25pa', 'temp', 'rh', 'dew']</span></span>
-<span id="cb15-192"><a href="#cb15-192" aria-hidden="true" tabindex="-1"></a><span class="co">full_df = full_df.loc[(full_df['pm25aqs'] &lt; 50)]</span></span>
-<span id="cb15-193"><a href="#cb15-193" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-194"><a href="#cb15-194" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-195"><a href="#cb15-195" aria-hidden="true" tabindex="-1"></a><span class="co">bad_dates = ['2019-08-21', '2019-08-22', '2019-09-24']</span></span>
-<span id="cb15-196"><a href="#cb15-196" aria-hidden="true" tabindex="-1"></a><span class="co">GA = full_df.loc[(full_df['id'] == 'GA1') &amp; (~full_df['date'].isin(bad_dates)) , :]</span></span>
-<span id="cb15-197"><a href="#cb15-197" aria-hidden="true" tabindex="-1"></a><span class="co">AQS, PA = GA[['pm25aqs']], GA['pm25pa']</span></span>
-<span id="cb15-198"><a href="#cb15-198" aria-hidden="true" tabindex="-1"></a><span class="co">AQS.head()</span></span>
-<span id="cb15-199"><a href="#cb15-199" aria-hidden="true" tabindex="-1"></a><span class="co">pd.DataFrame(PA).head()</span></span>
-<span id="cb15-200"><a href="#cb15-200" aria-hidden="true" tabindex="-1"></a><span class="co">``` --&gt;</span></span>
-<span id="cb15-201"><a href="#cb15-201" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-202"><a href="#cb15-202" aria-hidden="true" tabindex="-1"></a><span class="fu">## Collinearity</span></span>
-<span id="cb15-203"><a href="#cb15-203" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-204"><a href="#cb15-204" aria-hidden="true" tabindex="-1"></a><span class="fu">### Hypothesis Testing through Bootstrap: Snowy Plover Demo</span></span>
-<span id="cb15-205"><a href="#cb15-205" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-206"><a href="#cb15-206" aria-hidden="true" tabindex="-1"></a>An equivalent way to execute the hypothesis test described earlier is through **bootstrapping** (this equivalence can be proven through the <span class="co">[</span><span class="ot">duality argument</span><span class="co">](https://stats.stackexchange.com/questions/179902/confidence-interval-p-value-duality-vs-frequentist-interpretation-of-cis)</span>, which is out of scope for this class). We use bootstrapping to compute approximate 95% confidence intervals for each $\theta_i$. If the interval doesn't contain 0, we reject the null hypothesis at the 5% significance level. Otherwise, the data is consistent with the null, as the true parameter *could possibly* be 0.</span>
-<span id="cb15-207"><a href="#cb15-207" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-208"><a href="#cb15-208" aria-hidden="true" tabindex="-1"></a>To show an example of this hypothesis testing process, we'll work with the <span class="co">[</span><span class="ot">snowy plover</span><span class="co">](https://www.audubon.org/field-guide/bird/snowy-plover)</span> dataset throughout this section. The data are about the eggs and newly-hatched chicks of the Snowy Plover. The data were collected at the Point Reyes National Seashore by a former <span class="co">[</span><span class="ot">student at Berkeley</span><span class="co">](https://openlibrary.org/books/OL2038693M/BLSS_the_Berkeley_interactive_statistical_system)</span>. Here's a <span class="co">[</span><span class="ot">parent bird and some eggs</span><span class="co">](http://cescos.fau.edu/jay/eps/articles/snowyplover.html)</span>.</span>
-<span id="cb15-209"><a href="#cb15-209" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-210"><a href="#cb15-210" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;p</span> <span class="er">align</span><span class="ot">=</span><span class="st">"center"</span><span class="kw">&gt;</span></span>
-<span id="cb15-211"><a href="#cb15-211" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/plover_eggs.jpg"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'bvt'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'550'</span><span class="kw">&gt;</span></span>
-<span id="cb15-212"><a href="#cb15-212" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;/p&gt;</span></span>
-<span id="cb15-213"><a href="#cb15-213" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-214"><a href="#cb15-214" aria-hidden="true" tabindex="-1"></a>Note that <span class="in">`Egg Length`</span> and <span class="in">`Egg Breadth`</span> (widest diameter) are measured in millimeters, and <span class="in">`Egg Weight`</span> and <span class="in">`Bird Weight`</span> are measured in grams; for comparison, a standard paper clip weighs about one gram.</span>
-<span id="cb15-215"><a href="#cb15-215" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-218"><a href="#cb15-218" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-219"><a href="#cb15-219" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
-<span id="cb15-220"><a href="#cb15-220" aria-hidden="true" tabindex="-1"></a>eggs <span class="op">=</span> pd.read_csv(<span class="st">"data/snowy_plover.csv"</span>)</span>
-<span id="cb15-221"><a href="#cb15-221" aria-hidden="true" tabindex="-1"></a>eggs.head(<span class="dv">5</span>)</span>
-<span id="cb15-222"><a href="#cb15-222" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-223"><a href="#cb15-223" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-224"><a href="#cb15-224" aria-hidden="true" tabindex="-1"></a>Our goal will be to predict the weight of a newborn plover chick, which we assume follows the true relationship $Y = f_{\theta}(x)$ below.</span>
-<span id="cb15-225"><a href="#cb15-225" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-226"><a href="#cb15-226" aria-hidden="true" tabindex="-1"></a>$$\text{bird<span class="sc">\_</span>weight} = \theta_0 + \theta_1 \text{egg<span class="sc">\_</span>weight} + \theta_2 \text{egg<span class="sc">\_</span>length} + \theta_3 \text{egg<span class="sc">\_</span>breadth} + \epsilon$$</span>
-<span id="cb15-227"><a href="#cb15-227" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-228"><a href="#cb15-228" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>For each $i$, the parameter $\theta_i$ is a fixed number, but it is unobservable. We can only estimate it.</span>
-<span id="cb15-229"><a href="#cb15-229" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>The random error $\epsilon$ is also unobservable, but it is assumed to have expectation 0 and be independent and identically distributed across eggs.</span>
-<span id="cb15-230"><a href="#cb15-230" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-231"><a href="#cb15-231" aria-hidden="true" tabindex="-1"></a>Say we wish to determine if the <span class="in">`egg_weight`</span> impacts the <span class="in">`bird_weight`</span> of a chick – we want to infer if $\theta_1$ is equal to 0.</span>
-<span id="cb15-232"><a href="#cb15-232" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-233"><a href="#cb15-233" aria-hidden="true" tabindex="-1"></a>First, we define our hypotheses:</span>
-<span id="cb15-234"><a href="#cb15-234" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-235"><a href="#cb15-235" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Null hypothesis**: the true parameter $\theta_1$ is 0; any variation is due to random chance.</span>
-<span id="cb15-236"><a href="#cb15-236" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Alternative hypothesis**: the true parameter $\theta_1$ is not 0.</span>
-<span id="cb15-237"><a href="#cb15-237" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-238"><a href="#cb15-238" aria-hidden="true" tabindex="-1"></a>Next, we use our data to fit a model $\hat{Y} = f_{\hat{\theta}}(x)$ that approximates the relationship above. This gives us the **observed value** of $\hat{\theta}_1$ found from our data.</span>
-<span id="cb15-239"><a href="#cb15-239" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-242"><a href="#cb15-242" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-243"><a href="#cb15-243" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: false</span></span>
-<span id="cb15-244"><a href="#cb15-244" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LinearRegression</span>
-<span id="cb15-245"><a href="#cb15-245" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
-<span id="cb15-246"><a href="#cb15-246" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-247"><a href="#cb15-247" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> eggs[[<span class="st">"egg_weight"</span>, <span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>]]</span>
-<span id="cb15-248"><a href="#cb15-248" aria-hidden="true" tabindex="-1"></a>Y <span class="op">=</span> eggs[<span class="st">"bird_weight"</span>]</span>
-<span id="cb15-249"><a href="#cb15-249" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-250"><a href="#cb15-250" aria-hidden="true" tabindex="-1"></a>model <span class="op">=</span> LinearRegression()</span>
-<span id="cb15-251"><a href="#cb15-251" aria-hidden="true" tabindex="-1"></a>model.fit(X, Y)</span>
-<span id="cb15-252"><a href="#cb15-252" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-253"><a href="#cb15-253" aria-hidden="true" tabindex="-1"></a><span class="co"># This gives an array containing the fitted model parameter estimates</span></span>
-<span id="cb15-254"><a href="#cb15-254" aria-hidden="true" tabindex="-1"></a>thetas <span class="op">=</span> model.coef_</span>
-<span id="cb15-255"><a href="#cb15-255" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-256"><a href="#cb15-256" aria-hidden="true" tabindex="-1"></a><span class="co"># Put the parameter estimates in a nice table for viewing</span></span>
-<span id="cb15-257"><a href="#cb15-257" aria-hidden="true" tabindex="-1"></a>display(pd.DataFrame(</span>
-<span id="cb15-258"><a href="#cb15-258" aria-hidden="true" tabindex="-1"></a>  [model.intercept_] <span class="op">+</span> <span class="bu">list</span>(model.coef_),</span>
-<span id="cb15-259"><a href="#cb15-259" aria-hidden="true" tabindex="-1"></a>  columns<span class="op">=</span>[<span class="st">'theta_hat'</span>],</span>
-<span id="cb15-260"><a href="#cb15-260" aria-hidden="true" tabindex="-1"></a>  index<span class="op">=</span>[<span class="st">'intercept'</span>, <span class="st">'egg_weight'</span>, <span class="st">'egg_length'</span>, <span class="st">'egg_breadth'</span>]</span>
-<span id="cb15-261"><a href="#cb15-261" aria-hidden="true" tabindex="-1"></a>))</span>
-<span id="cb15-262"><a href="#cb15-262" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-263"><a href="#cb15-263" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="st">"RMSE"</span>, np.mean((Y <span class="op">-</span> model.predict(X)) <span class="op">**</span> <span class="dv">2</span>))</span>
-<span id="cb15-264"><a href="#cb15-264" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-265"><a href="#cb15-265" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-266"><a href="#cb15-266" aria-hidden="true" tabindex="-1"></a>Our single sample of data gives us the value of $\hat{\theta}_1=0.431$. To get a sense of how this estimate might vary if we were to draw different random samples, we will use <span class="co">[</span><span class="ot">bootstrapping</span><span class="co">](https://inferentialthinking.com/chapters/13/2/Bootstrap.html?)</span>. To construct a bootstrap sample, we will draw a resample from the collected data that:</span>
-<span id="cb15-267"><a href="#cb15-267" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-268"><a href="#cb15-268" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Has the same sample size as the collected data</span>
-<span id="cb15-269"><a href="#cb15-269" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Is drawn with replacement (this ensures that we don't draw the exact same sample every time!)</span>
-<span id="cb15-270"><a href="#cb15-270" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-271"><a href="#cb15-271" aria-hidden="true" tabindex="-1"></a>We draw a bootstrap sample, use this sample to fit a model, and record the result for $\hat{\theta}_1$ on this bootstrapped sample. We then repeat this process many times to generate a **bootstrapped empirical distribution** of $\hat{\theta}_1$. This gives us an estimate of what the true distribution of $\hat{\theta}_1$ across all possible samples might look like.</span>
-<span id="cb15-272"><a href="#cb15-272" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-275"><a href="#cb15-275" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-276"><a href="#cb15-276" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: false</span></span>
-<span id="cb15-277"><a href="#cb15-277" aria-hidden="true" tabindex="-1"></a><span class="co"># Set a random seed so you generate the same random sample as staff</span></span>
-<span id="cb15-278"><a href="#cb15-278" aria-hidden="true" tabindex="-1"></a><span class="co"># In the "real world", we wouldn't do this</span></span>
-<span id="cb15-279"><a href="#cb15-279" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
-<span id="cb15-280"><a href="#cb15-280" aria-hidden="true" tabindex="-1"></a>np.random.seed(<span class="dv">1337</span>)</span>
-<span id="cb15-281"><a href="#cb15-281" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-282"><a href="#cb15-282" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the sample size of each bootstrap sample</span></span>
-<span id="cb15-283"><a href="#cb15-283" aria-hidden="true" tabindex="-1"></a>n <span class="op">=</span> <span class="bu">len</span>(eggs)</span>
-<span id="cb15-284"><a href="#cb15-284" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-285"><a href="#cb15-285" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a list to store all the bootstrapped estimates</span></span>
-<span id="cb15-286"><a href="#cb15-286" aria-hidden="true" tabindex="-1"></a>estimates <span class="op">=</span> []</span>
-<span id="cb15-287"><a href="#cb15-287" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-288"><a href="#cb15-288" aria-hidden="true" tabindex="-1"></a><span class="co"># Generate a bootstrap resample from `eggs` and find an estimate for theta_1 using this sample. </span></span>
-<span id="cb15-289"><a href="#cb15-289" aria-hidden="true" tabindex="-1"></a><span class="co"># Repeat 10000 times.</span></span>
-<span id="cb15-290"><a href="#cb15-290" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10000</span>):</span>
-<span id="cb15-291"><a href="#cb15-291" aria-hidden="true" tabindex="-1"></a>    <span class="co"># draw a bootstrap sample</span></span>
-<span id="cb15-292"><a href="#cb15-292" aria-hidden="true" tabindex="-1"></a>    bootstrap_resample <span class="op">=</span> eggs.sample(n, replace<span class="op">=</span><span class="va">True</span>)</span>
-<span id="cb15-293"><a href="#cb15-293" aria-hidden="true" tabindex="-1"></a>    X_bootstrap <span class="op">=</span> bootstrap_resample[[<span class="st">"egg_weight"</span>, <span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>]]</span>
-<span id="cb15-294"><a href="#cb15-294" aria-hidden="true" tabindex="-1"></a>    Y_bootstrap <span class="op">=</span> bootstrap_resample[<span class="st">"bird_weight"</span>]</span>
-<span id="cb15-295"><a href="#cb15-295" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-296"><a href="#cb15-296" aria-hidden="true" tabindex="-1"></a>    <span class="co"># use bootstrapped sample to fit a model</span></span>
-<span id="cb15-297"><a href="#cb15-297" aria-hidden="true" tabindex="-1"></a>    bootstrap_model <span class="op">=</span> LinearRegression()</span>
-<span id="cb15-298"><a href="#cb15-298" aria-hidden="true" tabindex="-1"></a>    bootstrap_model.fit(X_bootstrap, Y_bootstrap)</span>
-<span id="cb15-299"><a href="#cb15-299" aria-hidden="true" tabindex="-1"></a>    bootstrap_thetas <span class="op">=</span> bootstrap_model.coef_</span>
-<span id="cb15-300"><a href="#cb15-300" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-301"><a href="#cb15-301" aria-hidden="true" tabindex="-1"></a>    <span class="co"># record the result for theta_1</span></span>
-<span id="cb15-302"><a href="#cb15-302" aria-hidden="true" tabindex="-1"></a>    estimates.append(bootstrap_thetas[<span class="dv">0</span>])</span>
-<span id="cb15-303"><a href="#cb15-303" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-304"><a href="#cb15-304" aria-hidden="true" tabindex="-1"></a><span class="co"># calculate the 95% confidence interval </span></span>
-<span id="cb15-305"><a href="#cb15-305" aria-hidden="true" tabindex="-1"></a>lower <span class="op">=</span> np.percentile(estimates, <span class="fl">2.5</span>, axis<span class="op">=</span><span class="dv">0</span>)</span>
-<span id="cb15-306"><a href="#cb15-306" aria-hidden="true" tabindex="-1"></a>upper <span class="op">=</span> np.percentile(estimates, <span class="fl">97.5</span>, axis<span class="op">=</span><span class="dv">0</span>)</span>
-<span id="cb15-307"><a href="#cb15-307" aria-hidden="true" tabindex="-1"></a>conf_interval <span class="op">=</span> (lower, upper)</span>
-<span id="cb15-308"><a href="#cb15-308" aria-hidden="true" tabindex="-1"></a>conf_interval</span>
-<span id="cb15-309"><a href="#cb15-309" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-310"><a href="#cb15-310" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-311"><a href="#cb15-311" aria-hidden="true" tabindex="-1"></a>Our bootstrapped 95% confidence interval for $\theta_1$ is $<span class="co">[</span><span class="ot">-0.259, 1.103</span><span class="co">]</span>$. Immediately, we can see that 0 *is* indeed contained in this interval – this means that we *cannot* conclude that $\theta_1$ is non-zero! More formally, we fail to reject the null hypothesis (that $\theta_1$ is 0) under a 5% p-value cutoff. </span>
-<span id="cb15-312"><a href="#cb15-312" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-313"><a href="#cb15-313" aria-hidden="true" tabindex="-1"></a>We can repeat this process to construct 95% confidence intervals for the other parameters of the model.</span>
-<span id="cb15-314"><a href="#cb15-314" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-317"><a href="#cb15-317" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-318"><a href="#cb15-318" aria-hidden="true" tabindex="-1"></a>np.random.seed(<span class="dv">1337</span>)</span>
-<span id="cb15-319"><a href="#cb15-319" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-320"><a href="#cb15-320" aria-hidden="true" tabindex="-1"></a>theta_0_estimates <span class="op">=</span> []</span>
-<span id="cb15-321"><a href="#cb15-321" aria-hidden="true" tabindex="-1"></a>theta_1_estimates <span class="op">=</span> []</span>
-<span id="cb15-322"><a href="#cb15-322" aria-hidden="true" tabindex="-1"></a>theta_2_estimates <span class="op">=</span> []</span>
-<span id="cb15-323"><a href="#cb15-323" aria-hidden="true" tabindex="-1"></a>theta_3_estimates <span class="op">=</span> []</span>
-<span id="cb15-324"><a href="#cb15-324" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-325"><a href="#cb15-325" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-326"><a href="#cb15-326" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10000</span>):</span>
-<span id="cb15-327"><a href="#cb15-327" aria-hidden="true" tabindex="-1"></a>    bootstrap_resample <span class="op">=</span> eggs.sample(n, replace<span class="op">=</span><span class="va">True</span>)</span>
-<span id="cb15-328"><a href="#cb15-328" aria-hidden="true" tabindex="-1"></a>    X_bootstrap <span class="op">=</span> bootstrap_resample[[<span class="st">"egg_weight"</span>, <span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>]]</span>
-<span id="cb15-329"><a href="#cb15-329" aria-hidden="true" tabindex="-1"></a>    Y_bootstrap <span class="op">=</span> bootstrap_resample[<span class="st">"bird_weight"</span>]</span>
-<span id="cb15-330"><a href="#cb15-330" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-331"><a href="#cb15-331" aria-hidden="true" tabindex="-1"></a>    bootstrap_model <span class="op">=</span> LinearRegression()</span>
-<span id="cb15-332"><a href="#cb15-332" aria-hidden="true" tabindex="-1"></a>    bootstrap_model.fit(X_bootstrap, Y_bootstrap)</span>
-<span id="cb15-333"><a href="#cb15-333" aria-hidden="true" tabindex="-1"></a>    bootstrap_theta_0 <span class="op">=</span> bootstrap_model.intercept_</span>
-<span id="cb15-334"><a href="#cb15-334" aria-hidden="true" tabindex="-1"></a>    bootstrap_theta_1, bootstrap_theta_2, bootstrap_theta_3 <span class="op">=</span> bootstrap_model.coef_</span>
-<span id="cb15-335"><a href="#cb15-335" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-336"><a href="#cb15-336" aria-hidden="true" tabindex="-1"></a>    theta_0_estimates.append(bootstrap_theta_0)</span>
-<span id="cb15-337"><a href="#cb15-337" aria-hidden="true" tabindex="-1"></a>    theta_1_estimates.append(bootstrap_theta_1)</span>
-<span id="cb15-338"><a href="#cb15-338" aria-hidden="true" tabindex="-1"></a>    theta_2_estimates.append(bootstrap_theta_2)</span>
-<span id="cb15-339"><a href="#cb15-339" aria-hidden="true" tabindex="-1"></a>    theta_3_estimates.append(bootstrap_theta_3)</span>
-<span id="cb15-340"><a href="#cb15-340" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-341"><a href="#cb15-341" aria-hidden="true" tabindex="-1"></a>theta_0_lower, theta_0_upper <span class="op">=</span> np.percentile(theta_0_estimates, <span class="fl">2.5</span>), np.percentile(theta_0_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb15-342"><a href="#cb15-342" aria-hidden="true" tabindex="-1"></a>theta_1_lower, theta_1_upper <span class="op">=</span> np.percentile(theta_1_estimates, <span class="fl">2.5</span>), np.percentile(theta_1_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb15-343"><a href="#cb15-343" aria-hidden="true" tabindex="-1"></a>theta_2_lower, theta_2_upper <span class="op">=</span> np.percentile(theta_2_estimates, <span class="fl">2.5</span>), np.percentile(theta_2_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb15-344"><a href="#cb15-344" aria-hidden="true" tabindex="-1"></a>theta_3_lower, theta_3_upper <span class="op">=</span> np.percentile(theta_3_estimates, <span class="fl">2.5</span>), np.percentile(theta_3_estimates, <span class="fl">97.5</span>)</span>
-<span id="cb15-345"><a href="#cb15-345" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-346"><a href="#cb15-346" aria-hidden="true" tabindex="-1"></a><span class="co"># Make a nice table to view results</span></span>
-<span id="cb15-347"><a href="#cb15-347" aria-hidden="true" tabindex="-1"></a>pd.DataFrame({<span class="st">"lower"</span>:[theta_0_lower, theta_1_lower, theta_2_lower, theta_3_lower], <span class="st">"upper"</span>:[theta_0_upper, <span class="op">\</span></span>
-<span id="cb15-348"><a href="#cb15-348" aria-hidden="true" tabindex="-1"></a>                theta_1_upper, theta_2_upper, theta_3_upper]}, index<span class="op">=</span>[<span class="st">"theta_0"</span>, <span class="st">"theta_1"</span>, <span class="st">"theta_2"</span>, <span class="st">"theta_3"</span>])</span>
-<span id="cb15-349"><a href="#cb15-349" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-350"><a href="#cb15-350" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-351"><a href="#cb15-351" aria-hidden="true" tabindex="-1"></a>Something's off here. Notice that 0 is included in the 95% confidence interval for *every* parameter of the model. Using the interpretation we outlined above, this would suggest that we can't say for certain that *any* of the input variables impact the response variable! This makes it seem like our model can't make any predictions – and yet, each model we fit in our bootstrap experiment above could very much make predictions of $Y$. </span>
-<span id="cb15-352"><a href="#cb15-352" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-353"><a href="#cb15-353" aria-hidden="true" tabindex="-1"></a>How can we explain this result? Think back to how we first interpreted the parameters of a linear model. We treated each $\theta_i$ as a slope, where a unit increase in $x_i$ leads to a $\theta_i$ increase in $Y$, **if all other variables are held constant**. It turns out that this last assumption is very important. If variables in our model are somehow related to one another, then it might not be possible to have a change in one of them while holding the others constant. This means that our interpretation framework is no longer valid! In the models we fit above, we incorporated <span class="in">`egg_length`</span>, <span class="in">`egg_breadth`</span>, and <span class="in">`egg_weight`</span> as input variables. These variables are very likely related to one another – an egg with large <span class="in">`egg_length`</span> and <span class="in">`egg_breadth`</span> will likely be heavy in <span class="in">`egg_weight`</span>. This means that the model parameters cannot be meaningfully interpreted as slopes. </span>
-<span id="cb15-354"><a href="#cb15-354" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-355"><a href="#cb15-355" aria-hidden="true" tabindex="-1"></a>To support this conclusion, we can visualize the relationships between our feature variables. Notice the strong positive association between the features.</span>
-<span id="cb15-356"><a href="#cb15-356" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-359"><a href="#cb15-359" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-360"><a href="#cb15-360" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns</span>
-<span id="cb15-361"><a href="#cb15-361" aria-hidden="true" tabindex="-1"></a>sns.pairplot(eggs[[<span class="st">"egg_length"</span>, <span class="st">"egg_breadth"</span>, <span class="st">"egg_weight"</span>, <span class="st">'bird_weight'</span>]])<span class="op">;</span></span>
-<span id="cb15-362"><a href="#cb15-362" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-363"><a href="#cb15-363" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-364"><a href="#cb15-364" aria-hidden="true" tabindex="-1"></a>This issue is known as **collinearity**, sometimes also called **multicollinearity**. Collinearity occurs when one feature can be predicted fairly accurately by a linear combination of the other features, which happens when one feature is highly correlated with the others. </span>
-<span id="cb15-365"><a href="#cb15-365" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-366"><a href="#cb15-366" aria-hidden="true" tabindex="-1"></a>Why is collinearity a problem? Its consequences span several aspects of the modeling process:</span>
-<span id="cb15-367"><a href="#cb15-367" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-368"><a href="#cb15-368" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Inference**: Slopes can't be interpreted for an inference task.</span>
-<span id="cb15-369"><a href="#cb15-369" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Model Variance**: If features strongly influence one another, even small changes in the sampled data can lead to large changes in the estimated slopes.</span>
-<span id="cb15-370"><a href="#cb15-370" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Unique Solution**: If one feature is a linear combination of the other features, the design matrix will not be full rank, and $\mathbb{X}^{\top}\mathbb{X}$ is not invertible. This means that least squares does not have a unique solution. See <span class="co">[</span><span class="ot">this section</span><span class="co">](https://ds100.org/course-notes/ols/ols.html#bonus-uniqueness-of-the-solution)</span> of Course Note 12 for more on this.</span>
-<span id="cb15-371"><a href="#cb15-371" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-372"><a href="#cb15-372" aria-hidden="true" tabindex="-1"></a>The take-home point is that we need to be careful with what features we select for modeling. If two features likely encode similar information, it is often a good idea to choose only one of them as an input variable.</span>
-<span id="cb15-373"><a href="#cb15-373" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-374"><a href="#cb15-374" aria-hidden="true" tabindex="-1"></a><span class="fu">### A Simpler Model</span></span>
-<span id="cb15-375"><a href="#cb15-375" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-376"><a href="#cb15-376" aria-hidden="true" tabindex="-1"></a>Let us now consider a more interpretable model: we instead assume a true relationship using only egg weight:</span>
-<span id="cb15-377"><a href="#cb15-377" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-378"><a href="#cb15-378" aria-hidden="true" tabindex="-1"></a>$$f_\theta(x) = \theta_0 + \theta_1 \text{egg<span class="sc">\_</span>weight} + \epsilon$$</span>
-<span id="cb15-379"><a href="#cb15-379" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-382"><a href="#cb15-382" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-383"><a href="#cb15-383" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LinearRegression</span>
-<span id="cb15-384"><a href="#cb15-384" aria-hidden="true" tabindex="-1"></a>X_int <span class="op">=</span> eggs[[<span class="st">"egg_weight"</span>]]</span>
-<span id="cb15-385"><a href="#cb15-385" aria-hidden="true" tabindex="-1"></a>Y_int <span class="op">=</span> eggs[<span class="st">"bird_weight"</span>]</span>
-<span id="cb15-386"><a href="#cb15-386" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-387"><a href="#cb15-387" aria-hidden="true" tabindex="-1"></a>model_int <span class="op">=</span> LinearRegression()</span>
-<span id="cb15-388"><a href="#cb15-388" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-389"><a href="#cb15-389" aria-hidden="true" tabindex="-1"></a>model_int.fit(X_int, Y_int)</span>
-<span id="cb15-390"><a href="#cb15-390" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-391"><a href="#cb15-391" aria-hidden="true" tabindex="-1"></a><span class="co"># This gives an array containing the fitted model parameter estimates</span></span>
-<span id="cb15-392"><a href="#cb15-392" aria-hidden="true" tabindex="-1"></a>thetas_int <span class="op">=</span> model_int.coef_</span>
-<span id="cb15-393"><a href="#cb15-393" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-394"><a href="#cb15-394" aria-hidden="true" tabindex="-1"></a><span class="co"># Put the parameter estimates in a nice table for viewing</span></span>
-<span id="cb15-395"><a href="#cb15-395" aria-hidden="true" tabindex="-1"></a>pd.DataFrame({<span class="st">"theta_hat"</span>:[model_int.intercept_, thetas_int[<span class="dv">0</span>]]}, index<span class="op">=</span>[<span class="st">"theta_0"</span>, <span class="st">"theta_1"</span>])</span>
-<span id="cb15-396"><a href="#cb15-396" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-397"><a href="#cb15-397" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-400"><a href="#cb15-400" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-401"><a href="#cb15-401" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: true</span></span>
-<span id="cb15-402"><a href="#cb15-402" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
-<span id="cb15-403"><a href="#cb15-403" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-404"><a href="#cb15-404" aria-hidden="true" tabindex="-1"></a><span class="co"># Set a random seed so you generate the same random sample as staff</span></span>
-<span id="cb15-405"><a href="#cb15-405" aria-hidden="true" tabindex="-1"></a><span class="co"># In the "real world", we wouldn't do this</span></span>
-<span id="cb15-406"><a href="#cb15-406" aria-hidden="true" tabindex="-1"></a>np.random.seed(<span class="dv">1337</span>)</span>
-<span id="cb15-407"><a href="#cb15-407" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-408"><a href="#cb15-408" aria-hidden="true" tabindex="-1"></a><span class="co"># Set the sample size of each bootstrap sample</span></span>
-<span id="cb15-409"><a href="#cb15-409" aria-hidden="true" tabindex="-1"></a>n <span class="op">=</span> <span class="bu">len</span>(eggs)</span>
-<span id="cb15-410"><a href="#cb15-410" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-411"><a href="#cb15-411" aria-hidden="true" tabindex="-1"></a><span class="co"># Create a list to store all the bootstrapped estimates</span></span>
-<span id="cb15-412"><a href="#cb15-412" aria-hidden="true" tabindex="-1"></a>estimates_int <span class="op">=</span> []</span>
-<span id="cb15-413"><a href="#cb15-413" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-414"><a href="#cb15-414" aria-hidden="true" tabindex="-1"></a><span class="co"># Generate a bootstrap resample from `eggs` and find an estimate for theta_1 using this sample. </span></span>
-<span id="cb15-415"><a href="#cb15-415" aria-hidden="true" tabindex="-1"></a><span class="co"># Repeat 10000 times.</span></span>
-<span id="cb15-416"><a href="#cb15-416" aria-hidden="true" tabindex="-1"></a><span class="cf">for</span> i <span class="kw">in</span> <span class="bu">range</span>(<span class="dv">10000</span>):</span>
-<span id="cb15-417"><a href="#cb15-417" aria-hidden="true" tabindex="-1"></a>    bootstrap_resample_int <span class="op">=</span> eggs.sample(n, replace<span class="op">=</span><span class="va">True</span>)</span>
-<span id="cb15-418"><a href="#cb15-418" aria-hidden="true" tabindex="-1"></a>    X_bootstrap_int <span class="op">=</span> bootstrap_resample_int[[<span class="st">"egg_weight"</span>]]</span>
-<span id="cb15-419"><a href="#cb15-419" aria-hidden="true" tabindex="-1"></a>    Y_bootstrap_int <span class="op">=</span> bootstrap_resample_int[<span class="st">"bird_weight"</span>]</span>
-<span id="cb15-420"><a href="#cb15-420" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-421"><a href="#cb15-421" aria-hidden="true" tabindex="-1"></a>    bootstrap_model_int <span class="op">=</span> LinearRegression()</span>
-<span id="cb15-422"><a href="#cb15-422" aria-hidden="true" tabindex="-1"></a>    bootstrap_model_int.fit(X_bootstrap_int, Y_bootstrap_int)</span>
-<span id="cb15-423"><a href="#cb15-423" aria-hidden="true" tabindex="-1"></a>    bootstrap_thetas_int <span class="op">=</span> bootstrap_model_int.coef_</span>
-<span id="cb15-424"><a href="#cb15-424" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-425"><a href="#cb15-425" aria-hidden="true" tabindex="-1"></a>    estimates_int.append(bootstrap_thetas_int[<span class="dv">0</span>])</span>
-<span id="cb15-426"><a href="#cb15-426" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-427"><a href="#cb15-427" aria-hidden="true" tabindex="-1"></a>plt.figure(dpi<span class="op">=</span><span class="dv">120</span>)</span>
-<span id="cb15-428"><a href="#cb15-428" aria-hidden="true" tabindex="-1"></a>sns.histplot(estimates_int, stat<span class="op">=</span><span class="st">"density"</span>)</span>
-<span id="cb15-429"><a href="#cb15-429" aria-hidden="true" tabindex="-1"></a>plt.xlabel(<span class="vs">r"$\hat{\theta}_1$"</span>)</span>
-<span id="cb15-430"><a href="#cb15-430" aria-hidden="true" tabindex="-1"></a>plt.title(<span class="vs">r"Bootstrapped estimates $\hat{\theta}_1$ Under the Interpretable Model"</span>)<span class="op">;</span></span>
-<span id="cb15-431"><a href="#cb15-431" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-432"><a href="#cb15-432" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-433"><a href="#cb15-433" aria-hidden="true" tabindex="-1"></a>Notice how the interpretable model performs almost as well as our other model:</span>
-<span id="cb15-434"><a href="#cb15-434" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-437"><a href="#cb15-437" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-438"><a href="#cb15-438" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.metrics <span class="im">import</span> mean_squared_error</span>
-<span id="cb15-439"><a href="#cb15-439" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-440"><a href="#cb15-440" aria-hidden="true" tabindex="-1"></a>rmse <span class="op">=</span> np.sqrt(mean_squared_error(Y, model.predict(X)))  <span class="co"># mean_squared_error returns MSE; the square root gives RMSE</span></span>
-<span id="cb15-441"><a href="#cb15-441" aria-hidden="true" tabindex="-1"></a>rmse_int <span class="op">=</span> np.sqrt(mean_squared_error(Y_int, model_int.predict(X_int)))</span>
-<span id="cb15-442"><a href="#cb15-442" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f'RMSE of Original Model: </span><span class="sc">{</span>rmse<span class="sc">}</span><span class="ss">'</span>)</span>
-<span id="cb15-443"><a href="#cb15-443" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f'RMSE of Interpretable Model: </span><span class="sc">{</span>rmse_int<span class="sc">}</span><span class="ss">'</span>)</span>
-<span id="cb15-444"><a href="#cb15-444" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-445"><a href="#cb15-445" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-446"><a href="#cb15-446" aria-hidden="true" tabindex="-1"></a>Yet, the confidence interval for the true parameter $\theta_{1}$ does not contain zero.</span>
-<span id="cb15-447"><a href="#cb15-447" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-450"><a href="#cb15-450" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb15-451"><a href="#cb15-451" aria-hidden="true" tabindex="-1"></a>lower_int <span class="op">=</span> np.percentile(estimates_int, <span class="fl">2.5</span>)</span>
-<span id="cb15-452"><a href="#cb15-452" aria-hidden="true" tabindex="-1"></a>upper_int <span class="op">=</span> np.percentile(estimates_int, <span class="fl">97.5</span>)</span>
-<span id="cb15-453"><a href="#cb15-453" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-454"><a href="#cb15-454" aria-hidden="true" tabindex="-1"></a>conf_interval_int <span class="op">=</span> (lower_int, upper_int)</span>
-<span id="cb15-455"><a href="#cb15-455" aria-hidden="true" tabindex="-1"></a>conf_interval_int</span>
-<span id="cb15-456"><a href="#cb15-456" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb15-457"><a href="#cb15-457" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-458"><a href="#cb15-458" aria-hidden="true" tabindex="-1"></a>In retrospect, it’s no surprise that the weight of an egg best predicts the weight of a newly-hatched chick.</span>
-<span id="cb15-459"><a href="#cb15-459" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-460"><a href="#cb15-460" aria-hidden="true" tabindex="-1"></a>A model with highly correlated variables prevents us from interpreting how the variables are related to the prediction.</span>
-<span id="cb15-461"><a href="#cb15-461" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-462"><a href="#cb15-462" aria-hidden="true" tabindex="-1"></a><span class="fu">### Reminder: Assumptions Matter</span></span>
-<span id="cb15-463"><a href="#cb15-463" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-464"><a href="#cb15-464" aria-hidden="true" tabindex="-1"></a>Keep the following in mind:</span>
-<span id="cb15-465"><a href="#cb15-465" aria-hidden="true" tabindex="-1"></a>All inference assumes that the regression model holds.</span>
-<span id="cb15-466"><a href="#cb15-466" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-467"><a href="#cb15-467" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>If the model doesn’t hold, the inference might not be valid.</span>
-<span id="cb15-468"><a href="#cb15-468" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>If the <span class="co">[</span><span class="ot">assumptions of the bootstrap</span><span class="co">](https://inferentialthinking.com/chapters/13/3/Confidence_Intervals.html?highlight=p%20value%20confidence%20interval#care-in-using-the-bootstrap-percentile-method)</span> don’t hold…</span>
-<span id="cb15-469"><a href="#cb15-469" aria-hidden="true" tabindex="-1"></a><span class="ss">    * </span>Sample size n is large</span>
-<span id="cb15-470"><a href="#cb15-470" aria-hidden="true" tabindex="-1"></a><span class="ss">    * </span>Sample is representative of population distribution (drawn i.i.d., unbiased)</span>
-<span id="cb15-471"><a href="#cb15-471" aria-hidden="true" tabindex="-1"></a>    </span>
-<span id="cb15-472"><a href="#cb15-472" aria-hidden="true" tabindex="-1"></a>    …then the results of the bootstrap might not be valid.</span>
-<span id="cb15-473"><a href="#cb15-473" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-474"><a href="#cb15-474" aria-hidden="true" tabindex="-1"></a><span class="fu">## [Bonus Content] </span></span>
-<span id="cb15-475"><a href="#cb15-475" aria-hidden="true" tabindex="-1"></a>Note: the content in this section is not in scope.</span>
-<span id="cb15-476"><a href="#cb15-476" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-477"><a href="#cb15-477" aria-hidden="true" tabindex="-1"></a><span class="co">&lt;!-- </span><span class="al">###</span><span class="co"> Correlation vs. Causation</span></span>
-<span id="cb15-478"><a href="#cb15-478" aria-hidden="true" tabindex="-1"></a><span class="co">Let us consider some questions in an arbitrary regression problem. </span></span>
-<span id="cb15-479"><a href="#cb15-479" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-480"><a href="#cb15-480" aria-hidden="true" tabindex="-1"></a><span class="co">What does $\theta_{j}$ mean in our regression?</span></span>
-<span id="cb15-481"><a href="#cb15-481" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-482"><a href="#cb15-482" aria-hidden="true" tabindex="-1"></a><span class="co">* Holding other variables fixed, how much should our prediction change with $X_{j}$?</span></span>
-<span id="cb15-483"><a href="#cb15-483" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-484"><a href="#cb15-484" aria-hidden="true" tabindex="-1"></a><span class="co">For simple linear regression, this boils down to the correlation coefficient</span></span>
-<span id="cb15-485"><a href="#cb15-485" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-486"><a href="#cb15-486" aria-hidden="true" tabindex="-1"></a><span class="co">* Does having more $x$ predict more $y$ (and by how much)? --&gt;</span></span>
-<span id="cb15-487"><a href="#cb15-487" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-488"><a href="#cb15-488" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-489"><a href="#cb15-489" aria-hidden="true" tabindex="-1"></a><span class="fu">### Prediction vs Causation</span></span>
-<span id="cb15-490"><a href="#cb15-490" aria-hidden="true" tabindex="-1"></a>The difference between correlation/prediction vs. causation is best illustrated through examples. </span>
-<span id="cb15-491"><a href="#cb15-491" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-492"><a href="#cb15-492" aria-hidden="true" tabindex="-1"></a>Some questions about **correlation / prediction** include:</span>
-<span id="cb15-493"><a href="#cb15-493" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-494"><a href="#cb15-494" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Are homes with granite countertops worth more money?</span>
-<span id="cb15-495"><a href="#cb15-495" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Is college GPA higher for students who win a certain scholarship?</span>
-<span id="cb15-496"><a href="#cb15-496" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Are breastfed babies less likely to develop asthma?</span>
-<span id="cb15-497"><a href="#cb15-497" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Do cancer patients given some aggressive treatment have a higher 5-year survival rate?</span>
-<span id="cb15-498"><a href="#cb15-498" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Are people who smoke more likely to get cancer? </span>
-<span id="cb15-499"><a href="#cb15-499" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-500"><a href="#cb15-500" aria-hidden="true" tabindex="-1"></a>While these may sound like causal questions, they are not! Questions about **causality** are about the **effects** of **interventions** (not just passive observation). For example:</span>
-<span id="cb15-501"><a href="#cb15-501" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-502"><a href="#cb15-502" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>How much do granite countertops **raise** the value of a house?</span>
-<span id="cb15-503"><a href="#cb15-503" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Does getting the scholarship **improve** students’ GPAs?</span>
-<span id="cb15-504"><a href="#cb15-504" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Does breastfeeding **protect** babies against asthma?</span>
-<span id="cb15-505"><a href="#cb15-505" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Does the treatment **improve** cancer survival?</span>
-<span id="cb15-506"><a href="#cb15-506" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Does smoking **cause** cancer?</span>
-<span id="cb15-507"><a href="#cb15-507" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-508"><a href="#cb15-508" aria-hidden="true" tabindex="-1"></a>Note, however, that regression coefficients are sometimes called “effects”, which can be deceptive!</span>
-<span id="cb15-509"><a href="#cb15-509" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-510"><a href="#cb15-510" aria-hidden="true" tabindex="-1"></a>When using data alone, **predictive questions** (e.g., are breastfed babies healthier?) can be answered, but **causal questions** (e.g., does breastfeeding improve babies’ health?) cannot. The reason is that an observed association can have many possible explanations. For example, possible explanations for why breastfed babies are healthier on average include:</span>
-<span id="cb15-511"><a href="#cb15-511" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-512"><a href="#cb15-512" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Causal effect:** breastfeeding makes babies healthier</span>
-<span id="cb15-513"><a href="#cb15-513" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Reverse causality:** healthier babies more likely to successfully breastfeed</span>
-<span id="cb15-514"><a href="#cb15-514" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>**Common cause:** healthier / richer parents have healthier babies and are more likely to breastfeed</span>
-<span id="cb15-515"><a href="#cb15-515" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-516"><a href="#cb15-516" aria-hidden="true" tabindex="-1"></a>We cannot tell which explanations are true (or to what extent) just by observing ($x$, $y$) pairs. Additionally, causal questions implicitly involve **counterfactuals**, events that didn't happen. For example, we could ask, **would** the **same** breastfed babies have been less healthy **if** they hadn’t been breastfed? The first explanation above (causal effect) implies they would have been, but the other two (reverse causality and common cause) do not. </span>
-<span id="cb15-517"><a href="#cb15-517" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-518"><a href="#cb15-518" aria-hidden="true" tabindex="-1"></a><span class="fu">### Confounders</span></span>
-<span id="cb15-519"><a href="#cb15-519" aria-hidden="true" tabindex="-1"></a>Let T represent a treatment (for example, alcohol use), and Y represent an outcome (for example, lung cancer).</span>
-<span id="cb15-520"><a href="#cb15-520" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-521"><a href="#cb15-521" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/confounder.png"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'confounder'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'600'</span><span class="kw">&gt;</span></span>
-<span id="cb15-522"><a href="#cb15-522" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-523"><a href="#cb15-523" aria-hidden="true" tabindex="-1"></a>A **confounder** is a variable that affects both T and Y, distorting the correlation between them. In the example above, a confounder would be any variable that influences both alcohol use and lung cancer. Confounders can be a measured covariate (a feature) or an unmeasured variable we don’t know about, and they generally cause problems, as the relationship between T and Y is really affected by data we cannot see. We commonly *assume that all confounders are observed* (this is also called **ignorability**).</span>
-<span id="cb15-524"><a href="#cb15-524" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-525"><a href="#cb15-525" aria-hidden="true" tabindex="-1"></a><span class="fu">### How to perform causal inference?</span></span>
-<span id="cb15-526"><a href="#cb15-526" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-527"><a href="#cb15-527" aria-hidden="true" tabindex="-1"></a>In a **randomized experiment**, participants are randomly assigned into two groups: treatment and control. A treatment is applied *only* to the treatment group; we assume ignorability and gather as many measurements as possible so that we can compare them between the control and treatment groups to determine whether or not the treatment is really the cause or just a confounding factor. </span>
-<span id="cb15-528"><a href="#cb15-528" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-529"><a href="#cb15-529" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/experiment.png"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'experiment'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'600'</span><span class="kw">&gt;</span></span>
-<span id="cb15-530"><a href="#cb15-530" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-531"><a href="#cb15-531" aria-hidden="true" tabindex="-1"></a>However, often, randomly assigning treatments is impractical or unethical. For example, assigning a treatment of cigarettes to test the effect of smoking on lungs would not only be impractical but also unethical.</span>
-<span id="cb15-532"><a href="#cb15-532" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-533"><a href="#cb15-533" aria-hidden="true" tabindex="-1"></a>An alternative to bypass this issue is to utilize **observational studies**. This can be done by obtaining two participant groups separated based on some identified treatment variable. Unlike randomized experiments, however, we cannot assume ignorability: the participants could have separated into the two groups based on other covariates! In addition, there could also be unmeasured confounders.</span>
-<span id="cb15-534"><a href="#cb15-534" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb15-535"><a href="#cb15-535" aria-hidden="true" tabindex="-1"></a><span class="kw">&lt;img</span> <span class="er">src</span><span class="ot">=</span><span class="st">"images/observational.png"</span> <span class="er">alt</span><span class="ot">=</span><span class="st">'observational'</span> <span class="er">width</span><span class="ot">=</span><span class="st">'600'</span><span class="kw">&gt;</span></span>
-<span id="cb15-536"><a href="#cb15-536" aria-hidden="true" tabindex="-1"></a></span>
-</code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</div></div></div></div></div>
-</div> <!-- /content -->
-
-
-
-</body></html>
\ No newline at end of file
diff --git a/inference_causality/inference_causality.qmd b/inference_causality/inference_causality.qmd
index d4e520fe..bad48b48 100644
--- a/inference_causality/inference_causality.qmd
+++ b/inference_causality/inference_causality.qmd
@@ -7,7 +7,7 @@ format:
     code-fold: true
     code-tools: true
     toc: true
-    toc-title: Causal Inference and the Bootstrap
+    toc-title: Causal Inference and Confounding
     page-layout: full
     theme:
       - cosmo
@@ -120,7 +120,117 @@ How well does bootstrapping actually represent our population? The bootstrapped
 In the real world, we don't know the population distribution. The center of the boostrapped distribution is the estimator applied to our original sample, so we have no way of recovering the estimator's true expected value; the center and spread of our bootstrap are *approximations*. The quality of our bootstrapped distribution also depends on the quality of our original sample; if our original sample was not representative of the population (like Sample 5 in the image above), then the bootstrap is next to useless. In general, bootstrapping works better for *large samples*, when the population distribution is *not heavily skewed* (no outliers), and when the estimator is *“low variance”* (insensitive to extreme values).
 
 ### Simple Bootstrap Example
-TODO
+Here we work through a simple example of the bootstrap when estimating the relationship between miles per gallon and the weight of a vehicle.
+
+Suppose we collected a sample of 20 cars from a population. For the purposes of this demo, we will treat seaborn's `mpg` dataset as the population. The following is a visualization of our sample:
+
+```{python}
+#| code-fold: true
+#| vscode: {languageId: python}
+import numpy as np
+import pandas as pd
+import sklearn.linear_model as lm
+import seaborn as sns
+import matplotlib.pyplot as plt
+
+np.random.seed(42)  # seed NumPy's global RNG so the 20-car sample is reproducible
+mpg_sample = sns.load_dataset('mpg').sample(20)
+sns.regplot(mpg_sample, x='weight', y='mpg', ci=None);  # ci=None: draw the fit line without a confidence band
+```
+
+Fitting a linear model we get an estimate of the slope:
+
+```{python}
+#| code-fold: false
+#| vscode: {languageId: python}
+model = lm.LinearRegression().fit(mpg_sample[['weight']], mpg_sample['mpg'])  # regress mpg on weight
+model.coef_[0]  # coefficient on weight, i.e. the estimated slope
+```
+
+#### Bootstrap Implementation
+Now let's use bootstrapping to estimate the sampling distribution of that coefficient. We first define an estimator function that computes the slope from a sample; we will then write a bootstrap function that applies this estimator to many bootstrap resamples to construct many bootstrap estimates of the slope.
+
+```{python}
+#| code-fold: false
+#| vscode: {languageId: python}
+def estimator(sample):
+    """Fit mpg ~ weight on `sample` by linear regression and return the slope."""
+    return lm.LinearRegression().fit(sample[['weight']], sample['mpg']).coef_[0]
+```
+
+The code below uses [`df.sample`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sample.html) with `frac=1` and `replace=True` to generate a bootstrap sample of the same size as the original sample, drawn with replacement.
+
+```{python}
+#| code-fold: false
+#| vscode: {languageId: python}
+def bootstrap(sample, statistic, num_repetitions):
+    """
+    Returns a list of num_repetitions values of statistic, each computed on a
+    bootstrap resample (same size as sample, drawn with replacement) of sample.
+    """
+    stats = []
+    for _ in range(num_repetitions):
+        # Step 1: Sample the Sample
+        bootstrap_sample = sample.sample(frac=1, replace=True)
+        # Step 2: compute statistics on the sample of the sample
+        bootstrap_stat = statistic(bootstrap_sample)
+        # Accumulate the statistics
+        stats.append(bootstrap_stat)
+    return stats
+```
+
+After constructing many bootstrap slope estimates (in this case 10,000), we can visualize the bootstrap distribution of the slope estimates.
+
+```{python}
+#| code-fold: true
+#| vscode: {languageId: python}
+bs_thetas = bootstrap(mpg_sample, estimator, 10000)  # 10,000 bootstrap estimates of the slope
+fig, ax = plt.subplots(dpi=120)  # plt.subplots returns a (figure, axes) pair
+sns.histplot(bs_thetas, ax=ax)
+ax.set_title('Bootstrap Distribution of the Slope');
+```
+
+#### Computing a Bootstrap CI
+We can now compute the confidence interval for the slope using the percentiles of the empirical bootstrap distribution. Here, we are looking for a 95% confidence interval, so we take the 2.5th and 97.5th percentiles of the bootstrap estimates as the bounds of our interval. We find the interval to be the range below:
+
+```{python}
+#| code-fold: true
+#| vscode: {languageId: python}
+def bootstrap_ci(bootstrap_samples, confidence_level=95):
+    """
+    Percentile interval covering the middle confidence_level percent of bootstrap_samples.
+    """
+    tail = (100 - confidence_level) / 2
+    bounds = [tail, 100 - tail]
+    return np.percentile(bootstrap_samples, bounds)
+print(bootstrap_ci(bs_thetas))
+```
+
+#### Comparing to the Population CIs
+In practice you don't have access to the population, but in this specific example we took our sample from a larger dataset that we can pretend is the population. Let's compare the bootstrap to repeatedly drawing fresh samples of 20 cars from the larger dataset. Here is the interval containing the middle 95% of the slope estimates across 10,000 such samples:
+
+```{python}
+#| code-fold: true
+#| vscode: {languageId: python}
+mpg_pop = sns.load_dataset('mpg')  # the full dataset, standing in for the population
+theta_est = [estimator(mpg_pop.sample(n=20)) for _ in range(10000)]
+print(bootstrap_ci(theta_est))
+```
+
+Visualizing the two distributions:
+
+```{python}
+#| code-fold: true
+#| vscode: {languageId: python}
+fig, ax = plt.subplots(dpi=120, figsize=(6, 4))  # plt.subplots returns a (figure, axes) pair
+sns.histplot(bs_thetas, label='Bootstrap Thetas', alpha=0.7, ax=ax)
+sns.histplot(theta_est, label='Population Sampled Thetas', alpha=0.7, ax=ax)
+ax.legend()
+ax.set_xlabel('value')
+ax.set_title('Distribution of the Slope');
+```
+
+Comparing the two distributions, we see that our bootstrap distribution does not exactly match the sampling distribution obtained from the population, but it is relatively close. This demonstrates the benefit of bootstrapping: without knowing the actual population distribution, we are able to roughly approximate the sampling distribution of the slope estimate using only a single random sample of 20 cars.
 
 <!-- #### PurpleAir (chose to skip this section because it's too complex for the amount of pedagogical value it adds)
 To show an example of this hypothesis testing process, we'll work with air quality measurement data. There are 2 common sources of air quality information: Air Quality System (AQS) and [PurpleAir sensors](https://www2.purpleair.com/). AQS is seen as the gold standard because it is high quality, well-calibrated, and publicly available. However, it is very expensive, and the sensors are far apart; reports are also delayed due to extensive calibration.  
@@ -150,7 +260,6 @@ PA - \theta_0 &\approx + \theta_1 \text{True Air Quality} \\
 $$
 :::
 
-```{python}
 #| code-fold: true
 import numpy as np
 import pandas as pd
@@ -195,7 +304,7 @@ GA = full_df.loc[(full_df['id'] == 'GA1') & (~full_df['date'].isin(bad_dates)) ,
 AQS, PA = GA[['pm25aqs']], GA['pm25pa']
 AQS.head()
 pd.DataFrame(PA).head()
-``` -->
+-->
 
 ## Collinearity
 
@@ -212,6 +321,8 @@ To show an example of this hypothesis testing process, we'll work with the [snow
 Note that `Egg Length` and `Egg Breadth` (widest diameter) are measured in millimeters, and `Egg Weight` and `Bird Weight` are measured in grams; for comparison, a standard paper clip weighs about one gram.
 
 ```{python}
+#| code-fold: true
+#| vscode: {languageId: python}
 import pandas as pd
 eggs = pd.read_csv("data/snowy_plover.csv")
 eggs.head(5)
@@ -235,6 +346,7 @@ Next, we use our data to fit a model $\hat{Y} = f_{\hat{\theta}}(x)$ that approx
 
 ```{python}
 #| code-fold: false
+#| vscode: {languageId: python}
 from sklearn.linear_model import LinearRegression
 import numpy as np
 
@@ -266,6 +378,7 @@ We draw a bootstrap sample, use this sample to fit a model, and record the resul
 
 ```{python}
 #| code-fold: false
+#| vscode: {languageId: python}
 # Set a random seed so you generate the same random sample as staff
 # In the "real world", we wouldn't do this
 import numpy as np
@@ -305,6 +418,7 @@ Our bootstrapped 95% confidence interval for $\theta_1$ is $[-0.259, 1.103]$. Im
 We can repeat this process to construct 95% confidence intervals for the other parameters of the model.
 
 ```{python}
+#| vscode: {languageId: python}
 np.random.seed(1337)
 
 theta_0_estimates = []
@@ -345,6 +459,7 @@ How can we explain this result? Think back to how we first interpreted the param
 To support this conclusion, we can visualize the relationships between our feature variables. Notice the strong positive association between the features.
 
 ```{python}
+#| vscode: {languageId: python}
 import seaborn as sns
 sns.pairplot(eggs[["egg_length", "egg_breadth", "egg_weight", 'bird_weight']]);
 ```
@@ -366,6 +481,7 @@ Let us now consider a more interpretable model: we instead assume a true relatio
 $$f_\theta(x) = \theta_0 + \theta_1 \text{egg\_weight} + \epsilon$$
 
 ```{python}
+#| vscode: {languageId: python}
 from sklearn.linear_model import LinearRegression
 X_int = eggs[["egg_weight"]]
 Y_int = eggs["bird_weight"]
@@ -383,6 +499,7 @@ pd.DataFrame({"theta_hat":[model_int.intercept_, thetas_int[0]]}, index=["theta_
 
 ```{python}
 #| code-fold: true
+#| vscode: {languageId: python}
 import matplotlib.pyplot as plt
 
 # Set a random seed so you generate the same random sample as staff
@@ -417,6 +534,7 @@ plt.title(r"Bootstrapped estimates $\hat{\theta}_1$ Under the Interpretable Mode
 Notice how the interpretable model performs almost as well as our other model:
 
 ```{python}
+#| vscode: {languageId: python}
 from sklearn.metrics import mean_squared_error
 
 rmse = mean_squared_error(Y, model.predict(X))
@@ -428,6 +546,7 @@ print(f'RMSE of Interpretable Model: {rmse_int}')
 Yet, the confidence interval for the true parameter $\theta_{1}$ does not contain zero.
 
 ```{python}
+#| vscode: {languageId: python}
 lower_int = np.percentile(estimates_int, 2.5)
 upper_int = np.percentile(estimates_int, 97.5)
 
@@ -514,4 +633,3 @@ An alternative to bypass this issue is to utilize **observational studies**. Thi
 
 <img src="images/observational.png" alt='observational' width='600'>
 
-

	egg_weight	egg_length	egg_breadth	bird_weight
0	7.4	28.80	21.84	5.2
1	7.7	29.04	22.45	5.4
2	7.9	29.36	22.48	5.6
3	7.5	30.10	21.71	5.3
4	8.3	30.17	22.75	5.9
	theta_hat
intercept	-4.605670
egg_weight	0.431229
egg_length	0.066570
egg_breadth	0.215914
	lower	upper
theta_0	-15.278542	5.161473
theta_1	-0.258648	1.103424
theta_2	-0.099138	0.208557
theta_3	-0.257141	0.758155