From 466a72e9742287268d2f5a29e260e6ea5a65308d Mon Sep 17 00:00:00 2001
From: Ian Dong <ihdong@berkeley.edu>
Date: Wed, 28 Feb 2024 22:51:08 -0800
Subject: [PATCH] Updated feature_engineering notes

---
 _quarto.yml                                  |    2 +-
 feature_engineering/feature_engineering.html | 1279 ------------------
 gradient_descent/gradient_descent.ipynb      |  965 +++++++++++++
 gradient_descent/gradient_descent.qmd        |    1 -
 4 files changed, 966 insertions(+), 1281 deletions(-)
 delete mode 100644 feature_engineering/feature_engineering.html
 create mode 100644 gradient_descent/gradient_descent.ipynb
diff --git a/_quarto.yml b/_quarto.yml
index 1e6c90e3..f8a4b473 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -29,7 +29,7 @@ book:
         - constant_model_loss_transformations/loss_transformations.qmd
         - ols/ols.qmd
         - gradient_descent/gradient_descent.qmd
-        # - feature_engineering/feature_engineering.qmd
+        - feature_engineering/feature_engineering.qmd
         # - case_study_HCE/case_study_HCE.qmd
         # - cv_regularization/cv_reg.qmd
         # - probability_1/probability_1.qmd
diff --git a/feature_engineering/feature_engineering.html b/feature_engineering/feature_engineering.html
deleted file mode 100644
index 6d91abb2..00000000
--- a/feature_engineering/feature_engineering.html
+++ /dev/null
@@ -1,1279 +0,0 @@
-<!DOCTYPE html>
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
-
-<meta charset="utf-8">
-<meta name="generator" content="quarto-1.4.548">
-
-<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
-
-
-<title>Feature Engineering</title>
-<style>
-code{white-space: pre-wrap;}
-span.smallcaps{font-variant: small-caps;}
-div.columns{display: flex; gap: min(4vw, 1.5em);}
-div.column{flex: auto; overflow-x: auto;}
-div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
-ul.task-list{list-style: none;}
-ul.task-list li input[type="checkbox"] {
-  width: 0.8em;
-  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
-  vertical-align: middle;
-}
-/* CSS for syntax highlighting */
-pre > code.sourceCode { white-space: pre; position: relative; }
-pre > code.sourceCode > span { line-height: 1.25; }
-pre > code.sourceCode > span:empty { height: 1.2em; }
-.sourceCode { overflow: visible; }
-code.sourceCode > span { color: inherit; text-decoration: inherit; }
-div.sourceCode { margin: 1em 0; }
-pre.sourceCode { margin: 0; }
-@media screen {
-div.sourceCode { overflow: auto; }
-}
-@media print {
-pre > code.sourceCode { white-space: pre-wrap; }
-pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
-}
-pre.numberSource code
-  { counter-reset: source-line 0; }
-pre.numberSource code > span
-  { position: relative; left: -4em; counter-increment: source-line; }
-pre.numberSource code > span > a:first-child::before
-  { content: counter(source-line);
-    position: relative; left: -1em; text-align: right; vertical-align: baseline;
-    border: none; display: inline-block;
-    -webkit-touch-callout: none; -webkit-user-select: none;
-    -khtml-user-select: none; -moz-user-select: none;
-    -ms-user-select: none; user-select: none;
-    padding: 0 4px; width: 4em;
-  }
-pre.numberSource { margin-left: 3em;  padding-left: 4px; }
-div.sourceCode
-  {   }
-@media screen {
-pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
-}
-</style>
-
-
-<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.5.1/jquery.min.js" integrity="sha512-bLT0Qm9VnAYZDflyKcBaQ2gg0hSYNQrJ8RilYldYQ1FxQYoCLtUjuuRuZo+fjqhx/qtq/1itJ0C2ejDxltZVFg==" crossorigin="anonymous"></script><script src="feature_engineering_files/libs/clipboard/clipboard.min.js"></script>
-<script src="feature_engineering_files/libs/quarto-html/quarto.js"></script>
-<script src="feature_engineering_files/libs/quarto-html/popper.min.js"></script>
-<script src="feature_engineering_files/libs/quarto-html/tippy.umd.min.js"></script>
-<script src="feature_engineering_files/libs/quarto-html/anchor.min.js"></script>
-<link href="feature_engineering_files/libs/quarto-html/tippy.css" rel="stylesheet">
-<link href="feature_engineering_files/libs/quarto-html/quarto-syntax-highlighting.css" rel="stylesheet" id="quarto-text-highlighting-styles">
-<script src="feature_engineering_files/libs/bootstrap/bootstrap.min.js"></script>
-<link href="feature_engineering_files/libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
-<link href="feature_engineering_files/libs/bootstrap/bootstrap.min.css" rel="stylesheet" id="quarto-bootstrap" data-mode="light">
-<script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.6/require.min.js" integrity="sha512-c3Nl8+7g4LMSTdrm621y7kf9v3SDPnhxLNhcjFJbKECVnmZHTdo+IRO05sNLTH/D3vA6u1X32ehoLC7WFVdheg==" crossorigin="anonymous"></script>
-
-<script type="application/javascript">define('jquery', [],function() {return window.jQuery;})</script>
-
-  <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
-  <script src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-chtml-full.js" type="text/javascript"></script>
-
-<script type="text/javascript">
-const typesetMath = (el) => { // Typeset the math inside `el` using MathJax if loaded, otherwise KaTeX; does nothing if neither global exists.
-  if (window.MathJax) {
-    // MathJax path: ask MathJax to typeset only the given element.
-    window.MathJax.typeset([el]);
-  } else if (window.katex) {
-    // KaTeX path: render each descendant of `el` that carries the "math" class.
-    var mathElements = el.getElementsByClassName("math");
-    var macros = []; // passed as the `macros` option to every render call below
-    for (var i = 0; i < mathElements.length; i++) {
-      var texText = mathElements[i].firstChild; // first child node; its `.data` is used as the TeX source
-      if (mathElements[i].tagName == "SPAN") { // elements with any other tag name are skipped
-        window.katex.render(texText.data, mathElements[i], {
-          displayMode: mathElements[i].classList.contains('display'), // display mode only when the span has class "display"
-          throwOnError: false,
-          macros: macros,
-          fleqn: false
-        });
-      }
-    }
-  }
-}
-window.Quarto = { // Publishes the helper as window.Quarto.typesetMath; this assignment replaces any existing window.Quarto value.
-  typesetMath
-};
-</script>
-
-</head>
-
-<body>
-
-<div id="quarto-content" class="page-columns page-rows-contents page-layout-full">
-<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
-  <nav id="TOC" role="doc-toc" class="toc-active">
-    <h2 id="toc-title">Feature Engineering</h2>
-   
-  <ul>
-  <li><a href="#feature-engineering" id="toc-feature-engineering" class="nav-link active" data-scroll-target="#feature-engineering">Feature Engineering</a></li>
-  <li><a href="#feature-functions" id="toc-feature-functions" class="nav-link" data-scroll-target="#feature-functions">Feature Functions</a></li>
-  <li><a href="#one-hot-encoding" id="toc-one-hot-encoding" class="nav-link" data-scroll-target="#one-hot-encoding">One Hot Encoding</a></li>
-  <li><a href="#polynomial-features" id="toc-polynomial-features" class="nav-link" data-scroll-target="#polynomial-features">Polynomial Features</a></li>
-  <li><a href="#complexity-and-overfitting" id="toc-complexity-and-overfitting" class="nav-link" data-scroll-target="#complexity-and-overfitting">Complexity and Overfitting</a></li>
-  </ul>
-</nav>
-</div>
-<main class="content column-page-left" id="quarto-document-content">
-
-<header id="title-block-header" class="quarto-title-block default">
-<div class="quarto-title">
-<div class="quarto-title-block"><div><h1 class="title">Feature Engineering</h1><button type="button" class="btn code-tools-button dropdown-toggle" id="quarto-code-tools-menu" data-bs-toggle="dropdown" aria-expanded="false"><i class="bi"></i> Code</button><ul class="dropdown-menu dropdown-menu-end" aria-labelledby="quarto-code-tools-menu"><li><a id="quarto-show-all-code" class="dropdown-item" href="javascript:void(0)" role="button">Show All Code</a></li><li><a id="quarto-hide-all-code" class="dropdown-item" href="javascript:void(0)" role="button">Hide All Code</a></li><li><hr class="dropdown-divider"></li><li><a id="quarto-view-source" class="dropdown-item" href="javascript:void(0)" role="button">View Source</a></li></ul></div></div>
-</div>
-
-
-
-<div class="quarto-title-meta column-page-left">
-
-    
-  
-    
-  </div>
-  
-
-
-</header>
-
-
-<div class="callout callout-style-default callout-note no-icon callout-titled">
-<div class="callout-header d-flex align-content-center" data-bs-toggle="collapse" data-bs-target=".callout-1-contents" aria-controls="callout-1" aria-expanded="true" aria-label="Toggle callout">
-<div class="callout-icon-container">
-<i class="callout-icon no-icon"></i>
-</div>
-<div class="callout-title-container flex-fill">
-Learning Outcomes
-</div>
-<div class="callout-btn-toggle d-inline-block border-0 py-1 ps-1 pe-0 float-end"><i class="callout-toggle"></i></div>
-</div>
-<div id="callout-1" class="callout-1-contents callout-collapse collapse show">
-<div class="callout-body-container callout-body">
-<ul>
-<li>Recognize the value of feature engineering as a tool to improve model performance</li>
-<li>Implement polynomial feature generation and one hot encoding</li>
-<li>Understand the interactions between model complexity, model variance, and training error</li>
-</ul>
-</div>
-</div>
-</div>
-<p>At this point, we’ve grown quite familiar with the modeling process. We’ve introduced the concept of loss, used it to fit several types of models, and, most recently, extended our analysis to multiple regression. Along the way, we’ve forged our way through the mathematics of deriving the optimal model parameters in all its gory detail. It’s time to make our lives a little easier – let’s implement the modeling process in code!</p>
-<p>In this lecture, we’ll explore two techniques for model fitting:</p>
-<ol type="1">
-<li>Translating our derived formulas for regression to <code>python</code></li>
-<li>Using <code>python</code>’s <code>sklearn</code> package</li>
-</ol>
-<p>With our new programming frameworks in hand, we will also add sophistication to our models by introducing more complex features to enhance model performance.</p>
-<section id="feature-engineering" class="level2">
-<h2 class="anchored" data-anchor-id="feature-engineering">Feature Engineering</h2>
-<p>At this point in the course, we’ve equipped ourselves with some powerful techniques to build and optimize models. We’ve explored how to develop models of multiple variables, as well as how to transform variables to help <strong>linearize</strong> a dataset and fit these models to maximize their performance.</p>
-<p>All of this was done with one major caveat: the regression models we’ve worked with so far are all <strong>linear in the input variables</strong>. We’ve assumed that our predictions should be some linear combination of the input variables. While this works well in some cases, the real world isn’t always so straightforward. We’ll learn an important method to address this issue – feature engineering – and consider some new problems that can arise when we do so.</p>
-<p>Feature engineering is the process of <em>transforming</em> raw features into <em>more informative features</em> that can be used in modeling or EDA tasks and improve model performance.</p>
-<p>Feature engineering allows you to:</p>
-<ul>
-<li>Capture domain knowledge</li>
-<li>Express non-linear relationships using linear models</li>
-<li>Use non-numeric (qualitative) features in models</li>
-</ul>
-</section>
-<section id="feature-functions" class="level2">
-<h2 class="anchored" data-anchor-id="feature-functions">Feature Functions</h2>
-<p>A <strong>feature function</strong> describes the transformations we apply to raw features in a dataset to create a design matrix of transformed features. We typically denote the feature function as <span class="math inline">\(\Phi\)</span> (think to yourself: “phi”-ture function). When we apply the feature function to our original dataset <span class="math inline">\(\mathbb{X}\)</span>, the result, <span class="math inline">\(\Phi(\mathbb{X})\)</span>, is a transformed design matrix ready to be used in modeling.</p>
-<p>For example, we might design a feature function that computes the square of an existing feature and adds it to the design matrix. In this case, our existing matrix <span class="math inline">\([x]\)</span> is transformed to <span class="math inline">\([x, x^2]\)</span>. Its <em>dimension</em> increases from 1 to 2. Often, the dimension of the <em>featurized</em> dataset increases as seen here.</p>
-<center>
-<img src="images/phi.png" alt="phi" width="700">
-</center>
-<p>The new features introduced by the feature function can then be used in modeling. Often, we use the symbol <span class="math inline">\(\phi_i\)</span> to represent transformed features after feature engineering.</p>
-<p><span class="math display">\[\hat{y} = \theta_1 x + \theta_2 x^2\]</span> <span class="math display">\[\hat{y}= \theta_1 \phi_1 + \theta_2 \phi_2\]</span></p>
-<p>In matrix notation, the symbol <span class="math inline">\(\Phi\)</span> is sometimes used to denote the design matrix after feature engineering has been performed. Note that in the usage below, <span class="math inline">\(\Phi\)</span> is now a feature-engineered matrix, rather than a function.</p>
-<p><span class="math display">\[\hat{\mathbb{Y}} = \Phi \theta\]</span></p>
-<p>More formally, we describe a feature function as transforming the original <span class="math inline">\(\mathbb{R}^{n \times p}\)</span> dataset <span class="math inline">\(\mathbb{X}\)</span> to a featurized <span class="math inline">\(\mathbb{R}^{n \times p'}\)</span> dataset <span class="math inline">\(\Phi\)</span>, where <span class="math inline">\(p'\)</span> is typically greater than <span class="math inline">\(p\)</span>.</p>
-<p><span class="math display">\[\mathbb{X} \in \mathbb{R}^{n \times p} \longrightarrow \Phi \in \mathbb{R}^{n \times p'}\]</span></p>
-</section>
-<section id="one-hot-encoding" class="level2">
-<h2 class="anchored" data-anchor-id="one-hot-encoding">One Hot Encoding</h2>
-<p>Feature engineering opens up a whole new set of possibilities for designing better-performing models. As you will see in lab and homework, feature engineering is one of the most important parts of the entire modeling process.</p>
-<p>A particularly powerful use of feature engineering is to allow us to perform regression on <em>non-numeric</em> features. <strong>One hot encoding</strong> is a feature engineering technique that generates numeric features from categorical data, allowing us to use our usual methods to fit a regression model on the data.</p>
-<p>To illustrate how this works, we’ll refer back to the <code>tips</code> dataset from previous lectures. Consider the <code>"day"</code> column of the dataset:</p>
-<div id="9b659264" class="cell" data-vscode="{&quot;languageId&quot;:&quot;python&quot;}" data-execution_count="1">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
-<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns</span>
-<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
-<span id="cb1-4"><a href="#cb1-4" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> sklearn.linear_model <span class="im">as</span> lm</span>
-<span id="cb1-5"><a href="#cb1-5" aria-hidden="true" tabindex="-1"></a>tips <span class="op">=</span> sns.load_dataset(<span class="st">"tips"</span>)</span>
-<span id="cb1-6"><a href="#cb1-6" aria-hidden="true" tabindex="-1"></a>tips.head()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-display" data-execution_count="1">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">total_bill</th>
-<th data-quarto-table-cell-role="th">tip</th>
-<th data-quarto-table-cell-role="th">sex</th>
-<th data-quarto-table-cell-role="th">smoker</th>
-<th data-quarto-table-cell-role="th">day</th>
-<th data-quarto-table-cell-role="th">time</th>
-<th data-quarto-table-cell-role="th">size</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">0</td>
-<td>16.99</td>
-<td>1.01</td>
-<td>Female</td>
-<td>No</td>
-<td>Sun</td>
-<td>Dinner</td>
-<td>2</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">1</td>
-<td>10.34</td>
-<td>1.66</td>
-<td>Male</td>
-<td>No</td>
-<td>Sun</td>
-<td>Dinner</td>
-<td>3</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">2</td>
-<td>21.01</td>
-<td>3.50</td>
-<td>Male</td>
-<td>No</td>
-<td>Sun</td>
-<td>Dinner</td>
-<td>3</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">3</td>
-<td>23.68</td>
-<td>3.31</td>
-<td>Male</td>
-<td>No</td>
-<td>Sun</td>
-<td>Dinner</td>
-<td>2</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">4</td>
-<td>24.59</td>
-<td>3.61</td>
-<td>Female</td>
-<td>No</td>
-<td>Sun</td>
-<td>Dinner</td>
-<td>4</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-</div>
-<p>At first glance, it doesn’t seem possible to fit a regression model to this data – we can’t directly perform any mathematical operations on the entry “Sun”.</p>
-<p>To resolve this, we instead create a new table with a feature for each unique value in the original <code>"day"</code> column. We then iterate through the <code>"day"</code> column. For each entry in <code>"day"</code> we fill the corresponding feature in the new table with 1. All other features are set to 0.</p>
-<center>
-<img src="images/ohe.png" alt="ohe" width="600">
-</center>
-<p><br></p>
-In short, each category of a categorical variable gets its own feature
-<ul>
-<li>
-Value = 1 if a row belongs to the category
-</li>
-<li>
-Value = 0 otherwise
-</li>
-</ul>
-<p>The <code>OneHotEncoder</code> class of <code>sklearn</code> (<a href="https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out">documentation</a>) offers a quick way to perform this one-hot encoding. You will explore its use in detail in the lab. For now, recognize that we follow a very similar workflow to when we were working with the <code>LinearRegression</code> class: we initialize a <code>OneHotEncoder</code> object, fit it to our data, and finally use <code>.transform</code> to apply the fitted encoder.</p>
-<div id="82b5a137" class="cell" data-vscode="{&quot;languageId&quot;:&quot;python&quot;}" data-execution_count="2">
-<div class="sourceCode cell-code" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> OneHotEncoder</span>
-<span id="cb2-2"><a href="#cb2-2" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb2-3"><a href="#cb2-3" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize a OneHotEncoder object</span></span>
-<span id="cb2-4"><a href="#cb2-4" aria-hidden="true" tabindex="-1"></a>ohe <span class="op">=</span> OneHotEncoder()</span>
-<span id="cb2-5"><a href="#cb2-5" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb2-6"><a href="#cb2-6" aria-hidden="true" tabindex="-1"></a><span class="co"># Fit the encoder</span></span>
-<span id="cb2-7"><a href="#cb2-7" aria-hidden="true" tabindex="-1"></a>ohe.fit(tips[[<span class="st">"day"</span>]])</span>
-<span id="cb2-8"><a href="#cb2-8" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb2-9"><a href="#cb2-9" aria-hidden="true" tabindex="-1"></a><span class="co"># Use the encoder to transform the raw "day" feature</span></span>
-<span id="cb2-10"><a href="#cb2-10" aria-hidden="true" tabindex="-1"></a>encoded_day <span class="op">=</span> ohe.transform(tips[[<span class="st">"day"</span>]]).toarray()</span>
-<span id="cb2-11"><a href="#cb2-11" aria-hidden="true" tabindex="-1"></a>encoded_day_df <span class="op">=</span> pd.DataFrame(encoded_day, columns<span class="op">=</span>ohe.get_feature_names_out())</span>
-<span id="cb2-12"><a href="#cb2-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb2-13"><a href="#cb2-13" aria-hidden="true" tabindex="-1"></a>encoded_day_df.head()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-<div class="cell-output cell-output-display" data-execution_count="2">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">day_Fri</th>
-<th data-quarto-table-cell-role="th">day_Sat</th>
-<th data-quarto-table-cell-role="th">day_Sun</th>
-<th data-quarto-table-cell-role="th">day_Thur</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">0</td>
-<td>0.0</td>
-<td>0.0</td>
-<td>1.0</td>
-<td>0.0</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">1</td>
-<td>0.0</td>
-<td>0.0</td>
-<td>1.0</td>
-<td>0.0</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">2</td>
-<td>0.0</td>
-<td>0.0</td>
-<td>1.0</td>
-<td>0.0</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">3</td>
-<td>0.0</td>
-<td>0.0</td>
-<td>1.0</td>
-<td>0.0</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">4</td>
-<td>0.0</td>
-<td>0.0</td>
-<td>1.0</td>
-<td>0.0</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-</div>
-<p>The one-hot encoded features can then be used in the design matrix to train a model:</p>
-<center>
-<img src="images/ohemodel.png" alt="ohemodel" width="600">
-</center>
-<p><span class="math display">\[\hat{y} = \theta_1 (\text{total}\_\text{bill}) + \theta_2 (\text{size}) + \theta_3 (\text{day}\_\text{Fri}) + \theta_4 (\text{day}\_\text{Sat}) + \theta_5 (\text{day}\_\text{Sun}) + \theta_6 (\text{day}\_\text{Thur})\]</span></p>
-<p>Or in shorthand:</p>
-<p><span class="math display">\[\hat{y} = \theta_{1}\phi_{1} + \theta_{2}\phi_{2} + \theta_{3}\phi_{3} + \theta_{4}\phi_{4} + \theta_{5}\phi_{5} + \theta_{6}\phi_{6}\]</span></p>
-<p>Now, the <code>day</code> feature (or rather, the four new boolean features that represent day) can be used to fit a model.</p>
-<p>Using <code>sklearn</code> to fit the new model, we can determine the model coefficients, allowing us to understand how each feature impacts the predicted tip.</p>
-<div id="d4c7c31f" class="cell" data-vscode="{&quot;languageId&quot;:&quot;python&quot;}" data-execution_count="3">
-<div class="sourceCode cell-code" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LinearRegression</span>
-<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a>data_w_ohe <span class="op">=</span> tips[[<span class="st">"total_bill"</span>, <span class="st">"size"</span>, <span class="st">"day"</span>]].join(encoded_day_df).drop(columns <span class="op">=</span> <span class="st">"day"</span>)</span>
-<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a>ohe_model <span class="op">=</span> lm.LinearRegression(fit_intercept<span class="op">=</span><span class="va">False</span>) <span class="co">#Tell sklearn to not add an additional bias column. Why?</span></span>
-<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a>ohe_model.fit(data_w_ohe, tips[<span class="st">"tip"</span>])</span>
-<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a>pd.DataFrame({<span class="st">"Feature"</span>:data_w_ohe.columns, <span class="st">"Model Coefficient"</span>:ohe_model.coef_})</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-<div class="cell-output cell-output-display" data-execution_count="3">
-<div>
-
-
-<table class="dataframe table table-sm table-striped small" data-quarto-postprocess="true" data-border="1">
-<thead>
-<tr class="header">
-<th data-quarto-table-cell-role="th"></th>
-<th data-quarto-table-cell-role="th">Feature</th>
-<th data-quarto-table-cell-role="th">Model Coefficient</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">0</td>
-<td>total_bill</td>
-<td>0.092994</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">1</td>
-<td>size</td>
-<td>0.187132</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">2</td>
-<td>day_Fri</td>
-<td>0.745787</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">3</td>
-<td>day_Sat</td>
-<td>0.621129</td>
-</tr>
-<tr class="odd">
-<td data-quarto-table-cell-role="th">4</td>
-<td>day_Sun</td>
-<td>0.732289</td>
-</tr>
-<tr class="even">
-<td data-quarto-table-cell-role="th">5</td>
-<td>day_Thur</td>
-<td>0.668294</td>
-</tr>
-</tbody>
-</table>
-
-</div>
-</div>
-</div>
-<p>For example, when looking at the coefficient for <code>day_Fri</code>, we can understand how much the fact that it is Friday impacts the predicted tip.</p>
-<p>When one-hot encoding, keep in mind that any set of one-hot encoded columns will always sum to a column of all ones, representing the bias column. More formally, the bias column is a linear combination of the OHE columns.</p>
-<center>
-<img src="images/bias.png" alt="bias" width="600">
-</center>
-<p>We must be careful not to include this bias column in our design matrix. Otherwise, there will be linear dependence in the model, meaning <span class="math inline">\(\mathbb{X}^{\top}\mathbb{X}\)</span> would no longer be invertible, and our OLS estimate <span class="math inline">\(\hat{\theta} = (\mathbb{X}^{\top}\mathbb{X})^{-1}\mathbb{X}^{\top}\mathbb{Y}\)</span> fails.</p>
-<p>To resolve this issue, we simply omit one of the one-hot encoded columns <em>or</em> do not include an intercept term. The adjusted design matrices are shown below.</p>
-<center>
-<img src="images/remove.png" alt="remove" width="600">
-</center>
-<p>Either approach works — we still retain the same information, because the omitted column is a linear combination of the remaining columns.</p>
-</section>
-<section id="polynomial-features" class="level2">
-<h2 class="anchored" data-anchor-id="polynomial-features">Polynomial Features</h2>
-<p>We have encountered a few cases now where models with linear features have performed poorly on datasets that show clear non-linear curvature.</p>
-<p>As an example, consider the <code>vehicles</code> dataset, which contains information about cars. Suppose we want to use the <code>"hp"</code> (horsepower) of a car to predict its <code>"mpg"</code> (gas mileage in miles per gallon). If we visualize the relationship between these two variables, we see a non-linear curvature. Fitting a linear model to these variables results in a high (poor) value of mean squared error (MSE).</p>
-<p><span class="math display">\[\hat{y} = \theta_0 + \theta_1 (\text{hp})\]</span></p>
-<div id="0ac83a07" class="cell" data-vscode="{&quot;languageId&quot;:&quot;python&quot;}" data-execution_count="4">
-<details class="code-fold">
-<summary>Code</summary>
-<div class="sourceCode cell-code" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>pd.options.mode.chained_assignment <span class="op">=</span> <span class="va">None</span> </span>
-<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>vehicles <span class="op">=</span> sns.load_dataset(<span class="st">"mpg"</span>).dropna().rename(columns <span class="op">=</span> {<span class="st">"horsepower"</span>: <span class="st">"hp"</span>}).sort_values(<span class="st">"hp"</span>)</span>
-<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> vehicles[[<span class="st">"hp"</span>]]</span>
-<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a>Y <span class="op">=</span> vehicles[<span class="st">"mpg"</span>]</span>
-<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a>hp_model <span class="op">=</span> lm.LinearRegression()</span>
-<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a>hp_model.fit(X, Y)</span>
-<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a>hp_model_predictions <span class="op">=</span> hp_model.predict(X)</span>
-<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
-<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a>sns.scatterplot(data<span class="op">=</span>vehicles, x<span class="op">=</span><span class="st">"hp"</span>, y<span class="op">=</span><span class="st">"mpg"</span>)</span>
-<span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a>plt.plot(vehicles[<span class="st">"hp"</span>], hp_model_predictions, c<span class="op">=</span><span class="st">"tab:red"</span>)<span class="op">;</span></span>
-<span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f"MSE of model with (hp) feature: </span><span class="sc">{</span>np<span class="sc">.</span>mean((Y<span class="op">-</span>hp_model_predictions)<span class="op">**</span><span class="dv">2</span>)<span class="sc">}</span><span class="ss">"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</details>
-<div class="cell-output cell-output-stdout">
-<pre><code>MSE of model with (hp) feature: 23.943662938603104</code></pre>
-</div>
-<div class="cell-output cell-output-display">
-<div>
-<figure class="figure">
-<p><img src="feature_engineering_files/figure-html/cell-5-output-2.png" width="585" height="429" class="figure-img"></p>
-</figure>
-</div>
-</div>
-</div>
-<p>To capture non-linearity in a dataset, it makes sense to incorporate <strong>non-linear</strong> features. Let’s introduce a <strong>polynomial</strong> term, <span class="math inline">\(\text{hp}^2\)</span>, into our regression model. The model now takes the form:</p>
-<p><span class="math display">\[\hat{y} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2)\]</span> <span class="math display">\[\hat{y} = \theta_0 + \theta_1 \phi_1 + \theta_2 \phi_2\]</span></p>
-<p>How can we fit a model with non-linear features? We can use the exact same techniques as before: ordinary least squares, gradient descent, or <code>sklearn</code>. This is because our new model is still a <strong>linear model</strong>. Although it contains non-linear <em>features</em>, it is linear with respect to the model <em>parameters</em>. All of our previous work on fitting models was done under the assumption that we were working with linear models. Because our new model is still linear, we can apply our existing methods to determine the optimal parameters.</p>
-<div id="5840e673" class="cell" data-vscode="{&quot;languageId&quot;:&quot;python&quot;}" data-execution_count="5">
-<div class="sourceCode cell-code" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="co"># Add a hp^2 feature to the design matrix</span></span>
-<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> vehicles[[<span class="st">"hp"</span>]]</span>
-<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a>X[<span class="st">"hp^2"</span>] <span class="op">=</span> vehicles[<span class="st">"hp"</span>]<span class="op">**</span><span class="dv">2</span></span>
-<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="co"># Use sklearn to fit the model</span></span>
-<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a>hp2_model <span class="op">=</span> lm.LinearRegression()</span>
-<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a>hp2_model.fit(X, Y)</span>
-<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a>hp2_model_predictions <span class="op">=</span> hp2_model.predict(X)</span>
-<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a>sns.scatterplot(data<span class="op">=</span>vehicles, x<span class="op">=</span><span class="st">"hp"</span>, y<span class="op">=</span><span class="st">"mpg"</span>)</span>
-<span id="cb6-11"><a href="#cb6-11" aria-hidden="true" tabindex="-1"></a>plt.plot(vehicles[<span class="st">"hp"</span>], hp2_model_predictions, c<span class="op">=</span><span class="st">"tab:red"</span>)<span class="op">;</span></span>
-<span id="cb6-12"><a href="#cb6-12" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb6-13"><a href="#cb6-13" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f"MSE of model with (hp^2) feature: </span><span class="sc">{</span>np<span class="sc">.</span>mean((Y<span class="op">-</span>hp2_model_predictions)<span class="op">**</span><span class="dv">2</span>)<span class="sc">}</span><span class="ss">"</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-<div class="cell-output cell-output-stdout">
-<pre><code>MSE of model with (hp^2) feature: 18.98476890761722</code></pre>
-</div>
-<div class="cell-output cell-output-display">
-<div>
-<figure class="figure">
-<p><img src="feature_engineering_files/figure-html/cell-6-output-2.png" width="585" height="429" class="figure-img"></p>
-</figure>
-</div>
-</div>
-</div>
-<p>Looking a lot better! By incorporating a squared feature, we are able to capture the curvature of the dataset. Our model is now a parabola centered on our data. Notice that our new model’s error has decreased relative to the original model with linear features.</p>
-</section>
-<section id="complexity-and-overfitting" class="level2">
-<h2 class="anchored" data-anchor-id="complexity-and-overfitting">Complexity and Overfitting</h2>
-<p>We’ve seen now that feature engineering allows us to build all sorts of features to improve the performance of the model. In particular, we saw that designing a more complex feature (squaring <code>hp</code> in the <code>vehicles</code> data previously) substantially improved the model’s ability to capture non-linear relationships. To take full advantage of this, we might be inclined to design increasingly complex features. Consider the following three models, each of a different order (the highest power of <code>hp</code> that appears in the model):</p>
-<ul>
-<li>Model with order 2: <span class="math inline">\(\hat{\text{mpg}} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2)\)</span></li>
-<li>Model with order 3: <span class="math inline">\(\hat{\text{mpg}} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2) + \theta_3 (\text{hp}^3)\)</span></li>
-<li>Model with order 4: <span class="math inline">\(\hat{\text{mpg}} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2) + \theta_3 (\text{hp}^3) + \theta_4 (\text{hp}^4)\)</span></li>
-</ul>
-<p><br></p>
-<center>
-<img src="images/degree_comparison.png" alt="degree_comparison" width="900">
-</center>
-<p>As we can see in the plots above, MSE continues to decrease with each additional polynomial term. To visualize it further, let’s plot models as the complexity increases from 0 to 6:</p>
-<center>
-<img src="images/degree_comparison2.png" alt="degree_comparison" width="900">
-</center>
-<p>When we use our model to make predictions on the same data that was used to fit the model, we find that the MSE decreases with each additional polynomial term (as our model gets more complex). The <strong>training error</strong> is the model’s error when generating predictions from the same data that was used for training purposes. We can conclude that the training error goes down as the complexity of the model increases.</p>
-<center>
-<img src="images/train_error.png" alt="train_error" width="400">
-</center>
-<p>This seems like good news – when working on the <strong>training data</strong>, we can improve model performance by designing increasingly complex models.</p>
-<blockquote class="blockquote">
-<p><strong>Math Fact</strong>: given <span class="math inline">\(N\)</span> non-overlapping data points (i.e., points with distinct <span class="math inline">\(x\)</span>-values), we can always find a polynomial of degree at most <span class="math inline">\(N-1\)</span> that goes through all those points.</p>
-For example: there always exists a degree-4 polynomial curve that can perfectly model a dataset of 5 datapoints.
-<center>
-<img src="images/perfect_poly_fits.png" alt="perfect_poly_fits" width="600">
-</center>
-</blockquote>
-<p>However, high model complexity comes with its own set of issues. When building the <code>vehicles</code> models above, we trained the models on the <em>entire</em> dataset and then evaluated their performance on this same dataset. In reality, we are likely to instead train the model on a <em>sample</em> from the population, then use it to make predictions on data it didn’t encounter during training.</p>
-<p>Let’s walk through a more realistic example. Say we are given a training dataset of just 6 datapoints and want to train a model to then make predictions on a <em>different</em> set of points. We may be tempted to make a highly complex model (e.g., degree 5), especially given that it makes perfect predictions on the training data, as is clear in the graph on the left. However, as shown in the graph on the right, this model would perform <em>horribly</em> on the rest of the population!</p>
-<center>
-<img src="images/complex.png" alt="complex" width="600">
-</center>
-<p>The phenomenon above is called <strong>overfitting</strong>. The model effectively just memorized the training data it encountered when it was fitted, leaving it unable to <strong>generalize</strong> well to data it didn’t encounter during training. This is a problem: we want models that are generalizable to “unseen” data.</p>
-<p>Additionally, since complex models are sensitive to the specific dataset used to train them, they have high <strong>variance</strong>. A model with high variance tends to <em>vary</em> more dramatically when trained on different datasets. Going back to our example above, we can see our degree-5 model varies erratically when we fit it to different samples of 6 points from <code>vehicles</code>.</p>
-<center>
-<img src="images/resamples.png" alt="resamples" width="800">
-</center>
-<p>We now face a dilemma: we know that we can <strong>decrease training error</strong> by increasing model complexity, but models that are <em>too</em> complex start to overfit and can’t be reapplied to new datasets due to <strong>high variance</strong>.</p>
-<center>
-<img src="images/bvt.png" alt="bvt" width="400">
-</center>
-<p>We can see that there is a clear trade-off that comes from the complexity of our model. As model complexity increases, the model’s error on the training data decreases. At the same time, the model’s variance tends to increase.</p>
-<p>The takeaway here: we need to strike a balance in the complexity of our models; we want models that are generalizable to “unseen” data. A model that is too simple won’t be able to capture the key relationships between our variables of interest; a model that is too complex runs the risk of overfitting.</p>
-<p>This raises the question: how do we control the complexity of a model? Stay tuned for Lecture 17 on Cross-Validation and Regularization!</p>
-<!-- -->
-
-</section>
-
-</main>
-<!-- /main column -->
-<script id="quarto-html-after-body" type="application/javascript">
-window.document.addEventListener("DOMContentLoaded", function (event) {
-  const toggleBodyColorMode = (bsSheetEl) => {
-    const mode = bsSheetEl.getAttribute("data-mode");
-    const bodyEl = window.document.querySelector("body");
-    if (mode === "dark") {
-      bodyEl.classList.add("quarto-dark");
-      bodyEl.classList.remove("quarto-light");
-    } else {
-      bodyEl.classList.add("quarto-light");
-      bodyEl.classList.remove("quarto-dark");
-    }
-  }
-  const toggleBodyColorPrimary = () => {
-    const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
-    if (bsSheetEl) {
-      toggleBodyColorMode(bsSheetEl);
-    }
-  }
-  toggleBodyColorPrimary();  
-  const icon = "";
-  const anchorJS = new window.AnchorJS();
-  anchorJS.options = {
-    placement: 'right',
-    icon: icon
-  };
-  anchorJS.add('.anchored');
-  const isCodeAnnotation = (el) => {
-    for (const clz of el.classList) {
-      if (clz.startsWith('code-annotation-')) {                     
-        return true;
-      }
-    }
-    return false;
-  }
-  const clipboard = new window.ClipboardJS('.code-copy-button', {
-    text: function(trigger) {
-      const codeEl = trigger.previousElementSibling.cloneNode(true);
-      for (const childEl of codeEl.children) {
-        if (isCodeAnnotation(childEl)) {
-          childEl.remove();
-        }
-      }
-      return codeEl.innerText;
-    }
-  });
-  clipboard.on('success', function(e) {
-    // button target
-    const button = e.trigger;
-    // don't keep focus
-    button.blur();
-    // flash "checked"
-    button.classList.add('code-copy-button-checked');
-    var currentTitle = button.getAttribute("title");
-    button.setAttribute("title", "Copied!");
-    let tooltip;
-    if (window.bootstrap) {
-      button.setAttribute("data-bs-toggle", "tooltip");
-      button.setAttribute("data-bs-placement", "left");
-      button.setAttribute("data-bs-title", "Copied!");
-      tooltip = new bootstrap.Tooltip(button, 
-        { trigger: "manual", 
-          customClass: "code-copy-button-tooltip",
-          offset: [0, -8]});
-      tooltip.show();    
-    }
-    setTimeout(function() {
-      if (tooltip) {
-        tooltip.hide();
-        button.removeAttribute("data-bs-title");
-        button.removeAttribute("data-bs-toggle");
-        button.removeAttribute("data-bs-placement");
-      }
-      button.setAttribute("title", currentTitle);
-      button.classList.remove('code-copy-button-checked');
-    }, 1000);
-    // clear code selection
-    e.clearSelection();
-  });
-  const viewSource = window.document.getElementById('quarto-view-source') ||
-                     window.document.getElementById('quarto-code-tools-source');
-  if (viewSource) {
-    const sourceUrl = viewSource.getAttribute("data-quarto-source-url");
-    viewSource.addEventListener("click", function(e) {
-      if (sourceUrl) {
-        // rstudio viewer pane
-        if (/\bcapabilities=\b/.test(window.location)) {
-          window.open(sourceUrl);
-        } else {
-          window.location.href = sourceUrl;
-        }
-      } else {
-        const modal = new bootstrap.Modal(document.getElementById('quarto-embedded-source-code-modal'));
-        modal.show();
-      }
-      return false;
-    });
-  }
-  function toggleCodeHandler(show) {
-    return function(e) {
-      const detailsSrc = window.document.querySelectorAll(".cell > details > .sourceCode");
-      for (let i=0; i<detailsSrc.length; i++) {
-        const details = detailsSrc[i].parentElement;
-        if (show) {
-          details.open = true;
-        } else {
-          details.removeAttribute("open");
-        }
-      }
-      const cellCodeDivs = window.document.querySelectorAll(".cell > .sourceCode");
-      const fromCls = show ? "hidden" : "unhidden";
-      const toCls = show ? "unhidden" : "hidden";
-      for (let i=0; i<cellCodeDivs.length; i++) {
-        const codeDiv = cellCodeDivs[i];
-        if (codeDiv.classList.contains(fromCls)) {
-          codeDiv.classList.remove(fromCls);
-          codeDiv.classList.add(toCls);
-        } 
-      }
-      return false;
-    }
-  }
-  const hideAllCode = window.document.getElementById("quarto-hide-all-code");
-  if (hideAllCode) {
-    hideAllCode.addEventListener("click", toggleCodeHandler(false));
-  }
-  const showAllCode = window.document.getElementById("quarto-show-all-code");
-  if (showAllCode) {
-    showAllCode.addEventListener("click", toggleCodeHandler(true));
-  }
-  function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
-    const config = {
-      allowHTML: true,
-      maxWidth: 500,
-      delay: 100,
-      arrow: false,
-      appendTo: function(el) {
-          return el.parentElement;
-      },
-      interactive: true,
-      interactiveBorder: 10,
-      theme: 'quarto',
-      placement: 'bottom-start',
-    };
-    if (contentFn) {
-      config.content = contentFn;
-    }
-    if (onTriggerFn) {
-      config.onTrigger = onTriggerFn;
-    }
-    if (onUntriggerFn) {
-      config.onUntrigger = onUntriggerFn;
-    }
-    window.tippy(el, config); 
-  }
-  const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
-  for (var i=0; i<noterefs.length; i++) {
-    const ref = noterefs[i];
-    tippyHover(ref, function() {
-      // use id or data attribute instead here
-      let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
-      try { href = new URL(href).hash; } catch {}
-      const id = href.replace(/^#\/?/, "");
-      const note = window.document.getElementById(id);
-      return note.innerHTML;
-    });
-  }
-  const xrefs = window.document.querySelectorAll('a.quarto-xref');
-  const processXRef = (id, note) => {
-    // Strip column container classes
-    const stripColumnClz = (el) => {
-      el.classList.remove("page-full", "page-columns");
-      if (el.children) {
-        for (const child of el.children) {
-          stripColumnClz(child);
-        }
-      }
-    }
-    stripColumnClz(note)
-    if (id === null || id.startsWith('sec-')) {
-      // Special case sections, only their first couple elements
-      const container = document.createElement("div");
-      if (note.children && note.children.length > 2) {
-        container.appendChild(note.children[0].cloneNode(true));
-        for (let i = 1; i < note.children.length; i++) {
-          const child = note.children[i];
-          if (child.tagName === "P" && child.innerText === "") {
-            continue;
-          } else {
-            container.appendChild(child.cloneNode(true));
-            break;
-          }
-        }
-        if (window.Quarto?.typesetMath) {
-          window.Quarto.typesetMath(container);
-        }
-        return container.innerHTML
-      } else {
-        if (window.Quarto?.typesetMath) {
-          window.Quarto.typesetMath(note);
-        }
-        return note.innerHTML;
-      }
-    } else {
-      // Remove any anchor links if they are present
-      const anchorLink = note.querySelector('a.anchorjs-link');
-      if (anchorLink) {
-        anchorLink.remove();
-      }
-      if (window.Quarto?.typesetMath) {
-        window.Quarto.typesetMath(note);
-      }
-      // TODO in 1.5, we should make sure this works without a callout special case
-      if (note.classList.contains("callout")) {
-        return note.outerHTML;
-      } else {
-        return note.innerHTML;
-      }
-    }
-  }
-  for (var i=0; i<xrefs.length; i++) {
-    const xref = xrefs[i];
-    tippyHover(xref, undefined, function(instance) {
-      instance.disable();
-      let url = xref.getAttribute('href');
-      let hash = undefined; 
-      if (url.startsWith('#')) {
-        hash = url;
-      } else {
-        try { hash = new URL(url).hash; } catch {}
-      }
-      if (hash) {
-        const id = hash.replace(/^#\/?/, "");
-        const note = window.document.getElementById(id);
-        if (note !== null) {
-          try {
-            const html = processXRef(id, note.cloneNode(true));
-            instance.setContent(html);
-          } finally {
-            instance.enable();
-            instance.show();
-          }
-        } else {
-          // See if we can fetch this
-          fetch(url.split('#')[0])
-          .then(res => res.text())
-          .then(html => {
-            const parser = new DOMParser();
-            const htmlDoc = parser.parseFromString(html, "text/html");
-            const note = htmlDoc.getElementById(id);
-            if (note !== null) {
-              const html = processXRef(id, note);
-              instance.setContent(html);
-            } 
-          }).finally(() => {
-            instance.enable();
-            instance.show();
-          });
-        }
-      } else {
-        // See if we can fetch a full url (with no hash to target)
-        // This is a special case and we should probably do some content thinning / targeting
-        fetch(url)
-        .then(res => res.text())
-        .then(html => {
-          const parser = new DOMParser();
-          const htmlDoc = parser.parseFromString(html, "text/html");
-          const note = htmlDoc.querySelector('main.content');
-          if (note !== null) {
-            // This should only happen for chapter cross references
-            // (since there is no id in the URL)
-            // remove the first header
-            if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
-              note.children[0].remove();
-            }
-            const html = processXRef(null, note);
-            instance.setContent(html);
-          } 
-        }).finally(() => {
-          instance.enable();
-          instance.show();
-        });
-      }
-    }, function(instance) {
-    });
-  }
-      let selectedAnnoteEl;
-      const selectorForAnnotation = ( cell, annotation) => {
-        let cellAttr = 'data-code-cell="' + cell + '"';
-        let lineAttr = 'data-code-annotation="' +  annotation + '"';
-        const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
-        return selector;
-      }
-      const selectCodeLines = (annoteEl) => {
-        const doc = window.document;
-        const targetCell = annoteEl.getAttribute("data-target-cell");
-        const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
-        const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
-        const lines = annoteSpan.getAttribute("data-code-lines").split(",");
-        const lineIds = lines.map((line) => {
-          return targetCell + "-" + line;
-        })
-        let top = null;
-        let height = null;
-        let parent = null;
-        if (lineIds.length > 0) {
-            //compute the position of the single el (top and bottom and make a div)
-            const el = window.document.getElementById(lineIds[0]);
-            top = el.offsetTop;
-            height = el.offsetHeight;
-            parent = el.parentElement.parentElement;
-          if (lineIds.length > 1) {
-            const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
-            const bottom = lastEl.offsetTop + lastEl.offsetHeight;
-            height = bottom - top;
-          }
-          if (top !== null && height !== null && parent !== null) {
-            // cook up a div (if necessary) and position it 
-            let div = window.document.getElementById("code-annotation-line-highlight");
-            if (div === null) {
-              div = window.document.createElement("div");
-              div.setAttribute("id", "code-annotation-line-highlight");
-              div.style.position = 'absolute';
-              parent.appendChild(div);
-            }
-            div.style.top = top - 2 + "px";
-            div.style.height = height + 4 + "px";
-            div.style.left = 0;
-            let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
-            if (gutterDiv === null) {
-              gutterDiv = window.document.createElement("div");
-              gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
-              gutterDiv.style.position = 'absolute';
-              const codeCell = window.document.getElementById(targetCell);
-              const gutter = codeCell.querySelector('.code-annotation-gutter');
-              gutter.appendChild(gutterDiv);
-            }
-            gutterDiv.style.top = top - 2 + "px";
-            gutterDiv.style.height = height + 4 + "px";
-          }
-          selectedAnnoteEl = annoteEl;
-        }
-      };
-      const unselectCodeLines = () => {
-        const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
-        elementsIds.forEach((elId) => {
-          const div = window.document.getElementById(elId);
-          if (div) {
-            div.remove();
-          }
-        });
-        selectedAnnoteEl = undefined;
-      };
-        // Handle positioning of the toggle
-    window.addEventListener(
-      "resize",
-      throttle(() => {
-        elRect = undefined;
-        if (selectedAnnoteEl) {
-          selectCodeLines(selectedAnnoteEl);
-        }
-      }, 10)
-    );
-    function throttle(fn, ms) {
-    let throttle = false;
-    let timer;
-      return (...args) => {
-        if(!throttle) { // first call gets through
-            fn.apply(this, args);
-            throttle = true;
-        } else { // all the others get throttled
-            if(timer) clearTimeout(timer); // cancel #2
-            timer = setTimeout(() => {
-              fn.apply(this, args);
-              timer = throttle = false;
-            }, ms);
-        }
-      };
-    }
-      // Attach click handler to the DT
-      const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
-      for (const annoteDlNode of annoteDls) {
-        annoteDlNode.addEventListener('click', (event) => {
-          const clickedEl = event.target;
-          if (clickedEl !== selectedAnnoteEl) {
-            unselectCodeLines();
-            const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
-            if (activeEl) {
-              activeEl.classList.remove('code-annotation-active');
-            }
-            selectCodeLines(clickedEl);
-            clickedEl.classList.add('code-annotation-active');
-          } else {
-            // Unselect the line
-            unselectCodeLines();
-            clickedEl.classList.remove('code-annotation-active');
-          }
-        });
-      }
-  const findCites = (el) => {
-    const parentEl = el.parentElement;
-    if (parentEl) {
-      const cites = parentEl.dataset.cites;
-      if (cites) {
-        return {
-          el,
-          cites: cites.split(' ')
-        };
-      } else {
-        return findCites(el.parentElement)
-      }
-    } else {
-      return undefined;
-    }
-  };
-  var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
-  for (var i=0; i<bibliorefs.length; i++) {
-    const ref = bibliorefs[i];
-    const citeInfo = findCites(ref);
-    if (citeInfo) {
-      tippyHover(citeInfo.el, function() {
-        var popup = window.document.createElement('div');
-        citeInfo.cites.forEach(function(cite) {
-          var citeDiv = window.document.createElement('div');
-          citeDiv.classList.add('hanging-indent');
-          citeDiv.classList.add('csl-entry');
-          var biblioDiv = window.document.getElementById('ref-' + cite);
-          if (biblioDiv) {
-            citeDiv.innerHTML = biblioDiv.innerHTML;
-          }
-          popup.appendChild(citeDiv);
-        });
-        return popup.innerHTML;
-      });
-    }
-  }
-});
-</script><div class="modal fade" id="quarto-embedded-source-code-modal" tabindex="-1" aria-labelledby="quarto-embedded-source-code-modal-label" aria-hidden="true"><div class="modal-dialog modal-dialog-scrollable"><div class="modal-content"><div class="modal-header"><h5 class="modal-title" id="quarto-embedded-source-code-modal-label">Source Code</h5><button class="btn-close" data-bs-dismiss="modal"></button></div><div class="modal-body"><div class="">
-<div class="sourceCode" id="cb8" data-shortcodes="false"><pre class="sourceCode markdown code-with-copy"><code class="sourceCode markdown"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
-<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="an">title:</span><span class="co"> Feature Engineering</span></span>
-<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="an">execute:</span></span>
-<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a><span class="co">  echo: true</span></span>
-<span id="cb8-5"><a href="#cb8-5" aria-hidden="true" tabindex="-1"></a><span class="co">  warning: false</span></span>
-<span id="cb8-6"><a href="#cb8-6" aria-hidden="true" tabindex="-1"></a><span class="an">format:</span></span>
-<span id="cb8-7"><a href="#cb8-7" aria-hidden="true" tabindex="-1"></a><span class="co">  html:</span></span>
-<span id="cb8-8"><a href="#cb8-8" aria-hidden="true" tabindex="-1"></a><span class="co">    code-fold: false</span></span>
-<span id="cb8-9"><a href="#cb8-9" aria-hidden="true" tabindex="-1"></a><span class="co">    code-tools: true</span></span>
-<span id="cb8-10"><a href="#cb8-10" aria-hidden="true" tabindex="-1"></a><span class="co">    toc: true</span></span>
-<span id="cb8-11"><a href="#cb8-11" aria-hidden="true" tabindex="-1"></a><span class="co">    toc-title: Feature Engineering</span></span>
-<span id="cb8-12"><a href="#cb8-12" aria-hidden="true" tabindex="-1"></a><span class="co">    page-layout: full</span></span>
-<span id="cb8-13"><a href="#cb8-13" aria-hidden="true" tabindex="-1"></a><span class="co">    theme:</span></span>
-<span id="cb8-14"><a href="#cb8-14" aria-hidden="true" tabindex="-1"></a><span class="co">      - cosmo</span></span>
-<span id="cb8-15"><a href="#cb8-15" aria-hidden="true" tabindex="-1"></a><span class="co">      - cerulean</span></span>
-<span id="cb8-16"><a href="#cb8-16" aria-hidden="true" tabindex="-1"></a><span class="co">    callout-icon: false</span></span>
-<span id="cb8-17"><a href="#cb8-17" aria-hidden="true" tabindex="-1"></a><span class="an">jupyter:</span><span class="co"> python3</span></span>
-<span id="cb8-18"><a href="#cb8-18" aria-hidden="true" tabindex="-1"></a><span class="co">---</span></span>
-<span id="cb8-19"><a href="#cb8-19" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-20"><a href="#cb8-20" aria-hidden="true" tabindex="-1"></a>::: {.callout-note collapse="false"}</span>
-<span id="cb8-21"><a href="#cb8-21" aria-hidden="true" tabindex="-1"></a><span class="fu">## Learning Outcomes</span></span>
-<span id="cb8-22"><a href="#cb8-22" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Recognize the value of feature engineering as a tool to improve model performance</span>
-<span id="cb8-23"><a href="#cb8-23" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Implement polynomial feature generation and one hot encoding</span>
-<span id="cb8-24"><a href="#cb8-24" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Understand the interactions between model complexity, model variance, and training error</span>
-<span id="cb8-25"><a href="#cb8-25" aria-hidden="true" tabindex="-1"></a>:::</span>
-<span id="cb8-26"><a href="#cb8-26" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-27"><a href="#cb8-27" aria-hidden="true" tabindex="-1"></a>At this point, we've grown quite familiar with the modeling process. We've introduced the concept of loss, used it to fit several types of models, and, most recently, extended our analysis to multiple regression. Along the way, we've forged our way through the mathematics of deriving the optimal model parameters in all its gory detail. It's time to make our lives a little easier – let's implement the modeling process in code!</span>
-<span id="cb8-28"><a href="#cb8-28" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-29"><a href="#cb8-29" aria-hidden="true" tabindex="-1"></a>In this lecture, we'll explore two techniques for model fitting:</span>
-<span id="cb8-30"><a href="#cb8-30" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-31"><a href="#cb8-31" aria-hidden="true" tabindex="-1"></a><span class="ss">1. </span>Translating our derived formulas for regression to <span class="in">`python`</span></span>
-<span id="cb8-32"><a href="#cb8-32" aria-hidden="true" tabindex="-1"></a><span class="ss">2. </span>Using <span class="in">`python`</span>'s <span class="in">`sklearn`</span> package</span>
-<span id="cb8-33"><a href="#cb8-33" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-34"><a href="#cb8-34" aria-hidden="true" tabindex="-1"></a>With our new programming frameworks in hand, we will also add sophistication to our models by introducing more complex features to enhance model performance. </span>
-<span id="cb8-35"><a href="#cb8-35" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-36"><a href="#cb8-36" aria-hidden="true" tabindex="-1"></a><span class="fu">## Feature Engineering</span></span>
-<span id="cb8-37"><a href="#cb8-37" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-38"><a href="#cb8-38" aria-hidden="true" tabindex="-1"></a>At this point in the course, we've equipped ourselves with some powerful techniques to build and optimize models. We've explored how to develop models of multiple variables, as well as how to transform variables to help **linearize** a dataset and fit these models to maximize their performance.</span>
-<span id="cb8-39"><a href="#cb8-39" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-40"><a href="#cb8-40" aria-hidden="true" tabindex="-1"></a>All of this was done with one major caveat: the regression models we've worked with so far are all **linear in the input variables**. We've assumed that our predictions should be some linear combination of the input variables. While this works well in some cases, the real world isn't always so straightforward. We'll learn an important method to address this issue – feature engineering – and consider some new problems that can arise when we do so.</span>
-<span id="cb8-41"><a href="#cb8-41" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-42"><a href="#cb8-42" aria-hidden="true" tabindex="-1"></a>Feature engineering is the process of *transforming* raw features into *more informative features* that can be used in modeling or EDA tasks and improve model performance.</span>
-<span id="cb8-43"><a href="#cb8-43" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-44"><a href="#cb8-44" aria-hidden="true" tabindex="-1"></a>Feature engineering allows you to:</span>
-<span id="cb8-45"><a href="#cb8-45" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-46"><a href="#cb8-46" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Capture domain knowledge </span>
-<span id="cb8-47"><a href="#cb8-47" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Express non-linear relationships using linear models</span>
-<span id="cb8-48"><a href="#cb8-48" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Use non-numeric (qualitative) features in models</span>
-<span id="cb8-49"><a href="#cb8-49" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-50"><a href="#cb8-50" aria-hidden="true" tabindex="-1"></a><span class="fu">## Feature Functions</span></span>
-<span id="cb8-51"><a href="#cb8-51" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-52"><a href="#cb8-52" aria-hidden="true" tabindex="-1"></a>A **feature function** describes the transformations we apply to raw features in a dataset to create a design matrix of transformed features. We typically denote the feature function as $\Phi$ (think to yourself: "phi"-true function). When we apply the feature function to our original dataset $\mathbb{X}$, the result, $\Phi(\mathbb{X})$, is a transformed design matrix ready to be used in modeling. </span>
-<span id="cb8-53"><a href="#cb8-53" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-54"><a href="#cb8-54" aria-hidden="true" tabindex="-1"></a>For example, we might design a feature function that computes the square of an existing feature and adds it to the design matrix. In this case, our existing matrix $<span class="co">[</span><span class="ot">x</span><span class="co">]</span>$ is transformed to $<span class="co">[</span><span class="ot">x, x^2</span><span class="co">]</span>$. Its *dimension* increases from 1 to 2. Often, the dimension of the *featurized* dataset increases as seen here.</span>
-<span id="cb8-55"><a href="#cb8-55" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-56"><a href="#cb8-56" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/phi.png" alt='phi' width='700'&gt;&lt;/center&gt;</span>
-<span id="cb8-57"><a href="#cb8-57" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-58"><a href="#cb8-58" aria-hidden="true" tabindex="-1"></a>The new features introduced by the feature function can then be used in modeling. Often, we use the symbol $\phi_i$ to represent transformed features after feature engineering. </span>
-<span id="cb8-59"><a href="#cb8-59" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-60"><a href="#cb8-60" aria-hidden="true" tabindex="-1"></a>$$\hat{y} = \theta_1 x + \theta_2 x^2$$</span>
-<span id="cb8-61"><a href="#cb8-61" aria-hidden="true" tabindex="-1"></a>$$\hat{y}= \theta_1 \phi_1 + \theta_2 \phi_2$$</span>
-<span id="cb8-62"><a href="#cb8-62" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-63"><a href="#cb8-63" aria-hidden="true" tabindex="-1"></a>In matrix notation, the symbol $\Phi$ is sometimes used to denote the design matrix after feature engineering has been performed. Note that in the usage below, $\Phi$ is now a feature-engineered matrix, rather than a function.</span>
-<span id="cb8-64"><a href="#cb8-64" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-65"><a href="#cb8-65" aria-hidden="true" tabindex="-1"></a>$$\hat{\mathbb{Y}} = \Phi \theta$$</span>
-<span id="cb8-66"><a href="#cb8-66" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-67"><a href="#cb8-67" aria-hidden="true" tabindex="-1"></a>More formally, we describe a feature function as transforming the original $\mathbb{R}^{n \times p}$ dataset $\mathbb{X}$ to a featurized $\mathbb{R}^{n \times p'}$ dataset $\Phi$, where $p'$ is typically greater than $p$. </span>
-<span id="cb8-68"><a href="#cb8-68" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-69"><a href="#cb8-69" aria-hidden="true" tabindex="-1"></a>$$\mathbb{X} \in \mathbb{R}^{n \times p} \longrightarrow \Phi \in \mathbb{R}^{n \times p'}$$</span>
-<span id="cb8-70"><a href="#cb8-70" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-71"><a href="#cb8-71" aria-hidden="true" tabindex="-1"></a><span class="fu">## One Hot Encoding</span></span>
-<span id="cb8-72"><a href="#cb8-72" aria-hidden="true" tabindex="-1"></a>Feature engineering opens up a whole new set of possibilities for designing better-performing models. As you will see in lab and homework, feature engineering is one of the most important parts of the entire modeling process.</span>
-<span id="cb8-73"><a href="#cb8-73" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-74"><a href="#cb8-74" aria-hidden="true" tabindex="-1"></a>A particularly powerful use of feature engineering is to allow us to perform regression on *non-numeric* features. **One hot encoding** is a feature engineering technique that generates numeric features from categorical data, allowing us to use our usual methods to fit a regression model on the data. </span>
-<span id="cb8-75"><a href="#cb8-75" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-76"><a href="#cb8-76" aria-hidden="true" tabindex="-1"></a>To illustrate how this works, we'll refer back to the <span class="in">`tips`</span> dataset from previous lectures. Consider the <span class="in">`"day"`</span> column of the dataset:</span>
-<span id="cb8-77"><a href="#cb8-77" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-80"><a href="#cb8-80" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb8-81"><a href="#cb8-81" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: true</span></span>
-<span id="cb8-82"><a href="#cb8-82" aria-hidden="true" tabindex="-1"></a><span class="co">#| vscode: {languageId: python}</span></span>
-<span id="cb8-83"><a href="#cb8-83" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> numpy <span class="im">as</span> np</span>
-<span id="cb8-84"><a href="#cb8-84" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> seaborn <span class="im">as</span> sns</span>
-<span id="cb8-85"><a href="#cb8-85" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
-<span id="cb8-86"><a href="#cb8-86" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> sklearn.linear_model <span class="im">as</span> lm</span>
-<span id="cb8-87"><a href="#cb8-87" aria-hidden="true" tabindex="-1"></a>tips <span class="op">=</span> sns.load_dataset(<span class="st">"tips"</span>)</span>
-<span id="cb8-88"><a href="#cb8-88" aria-hidden="true" tabindex="-1"></a>tips.head()</span>
-<span id="cb8-89"><a href="#cb8-89" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb8-90"><a href="#cb8-90" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-91"><a href="#cb8-91" aria-hidden="true" tabindex="-1"></a>   At first glance, it doesn't seem possible to fit a regression model to this data – we can't directly perform any mathematical operations on the entry "Sun". </span>
-<span id="cb8-92"><a href="#cb8-92" aria-hidden="true" tabindex="-1"></a>  </span>
-<span id="cb8-93"><a href="#cb8-93" aria-hidden="true" tabindex="-1"></a>To resolve this, we instead create a new table with a feature for each unique value in the original <span class="in">`"day"`</span> column. We then iterate through the <span class="in">`"day"`</span> column. For each entry in <span class="in">`"day"`</span> we fill the corresponding feature in the new table with 1. All other features are set to 0.</span>
-<span id="cb8-94"><a href="#cb8-94" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-95"><a href="#cb8-95" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/ohe.png" alt='ohe' width='600'&gt;&lt;/center&gt;</span>
-<span id="cb8-96"><a href="#cb8-96" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-97"><a href="#cb8-97" aria-hidden="true" tabindex="-1"></a>&lt;br&gt; </span>
-<span id="cb8-98"><a href="#cb8-98" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-99"><a href="#cb8-99" aria-hidden="true" tabindex="-1"></a>In short, each category of a categorical variable gets its own feature:</span>
-<span id="cb8-100"><a href="#cb8-100" aria-hidden="true" tabindex="-1"></a>&lt;ul&gt;</span>
-<span id="cb8-101"><a href="#cb8-101" aria-hidden="true" tabindex="-1"></a>   &lt;li&gt;</span>
-<span id="cb8-102"><a href="#cb8-102" aria-hidden="true" tabindex="-1"></a>      Value = 1 if a row belongs to the category</span>
-<span id="cb8-103"><a href="#cb8-103" aria-hidden="true" tabindex="-1"></a>   &lt;/li&gt;</span>
-<span id="cb8-104"><a href="#cb8-104" aria-hidden="true" tabindex="-1"></a>   &lt;li&gt;</span>
-<span id="cb8-105"><a href="#cb8-105" aria-hidden="true" tabindex="-1"></a>      Value = 0 otherwise</span>
-<span id="cb8-106"><a href="#cb8-106" aria-hidden="true" tabindex="-1"></a>   &lt;/li&gt;</span>
-<span id="cb8-107"><a href="#cb8-107" aria-hidden="true" tabindex="-1"></a>&lt;/ul&gt;</span>
-<span id="cb8-108"><a href="#cb8-108" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-109"><a href="#cb8-109" aria-hidden="true" tabindex="-1"></a>The <span class="in">`OneHotEncoder`</span> class of <span class="in">`sklearn`</span> (<span class="co">[</span><span class="ot">documentation</span><span class="co">](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out)</span>) offers a quick way to perform this one-hot encoding. You will explore its use in detail in the lab. For now, recognize that we follow a very similar workflow to when we were working with the <span class="in">`LinearRegression`</span> class: we initialize a <span class="in">`OneHotEncoder`</span> object, fit it to our data, and finally use <span class="in">`.transform`</span> to apply the fitted encoder.</span>
-<span id="cb8-110"><a href="#cb8-110" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-113"><a href="#cb8-113" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb8-114"><a href="#cb8-114" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: false</span></span>
-<span id="cb8-115"><a href="#cb8-115" aria-hidden="true" tabindex="-1"></a><span class="co">#| vscode: {languageId: python}</span></span>
-<span id="cb8-116"><a href="#cb8-116" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.preprocessing <span class="im">import</span> OneHotEncoder</span>
-<span id="cb8-117"><a href="#cb8-117" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-118"><a href="#cb8-118" aria-hidden="true" tabindex="-1"></a><span class="co"># Initialize a OneHotEncoder object</span></span>
-<span id="cb8-119"><a href="#cb8-119" aria-hidden="true" tabindex="-1"></a>ohe <span class="op">=</span> OneHotEncoder()</span>
-<span id="cb8-120"><a href="#cb8-120" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-121"><a href="#cb8-121" aria-hidden="true" tabindex="-1"></a><span class="co"># Fit the encoder</span></span>
-<span id="cb8-122"><a href="#cb8-122" aria-hidden="true" tabindex="-1"></a>ohe.fit(tips[[<span class="st">"day"</span>]])</span>
-<span id="cb8-123"><a href="#cb8-123" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-124"><a href="#cb8-124" aria-hidden="true" tabindex="-1"></a><span class="co"># Use the encoder to transform the raw "day" feature</span></span>
-<span id="cb8-125"><a href="#cb8-125" aria-hidden="true" tabindex="-1"></a>encoded_day <span class="op">=</span> ohe.transform(tips[[<span class="st">"day"</span>]]).toarray()</span>
-<span id="cb8-126"><a href="#cb8-126" aria-hidden="true" tabindex="-1"></a>encoded_day_df <span class="op">=</span> pd.DataFrame(encoded_day, columns<span class="op">=</span>ohe.get_feature_names_out())</span>
-<span id="cb8-127"><a href="#cb8-127" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-128"><a href="#cb8-128" aria-hidden="true" tabindex="-1"></a>encoded_day_df.head()</span>
-<span id="cb8-129"><a href="#cb8-129" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb8-130"><a href="#cb8-130" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-131"><a href="#cb8-131" aria-hidden="true" tabindex="-1"></a>The one-hot encoded features can then be used in the design matrix to train a model:</span>
-<span id="cb8-132"><a href="#cb8-132" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-133"><a href="#cb8-133" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/ohemodel.png" alt='ohemodel' width='600'&gt;&lt;/center&gt;</span>
-<span id="cb8-134"><a href="#cb8-134" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-135"><a href="#cb8-135" aria-hidden="true" tabindex="-1"></a>$$\hat{y} = \theta_1 (\text{total}<span class="sc">\_</span>\text{bill}) + \theta_2 (\text{size}) + \theta_3 (\text{day}<span class="sc">\_</span>\text{Fri}) + \theta_4 (\text{day}<span class="sc">\_</span>\text{Sat}) + \theta_5 (\text{day}<span class="sc">\_</span>\text{Sun}) + \theta_6 (\text{day}<span class="sc">\_</span>\text{Thur})$$</span>
-<span id="cb8-136"><a href="#cb8-136" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-137"><a href="#cb8-137" aria-hidden="true" tabindex="-1"></a>Or in shorthand:</span>
-<span id="cb8-138"><a href="#cb8-138" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-139"><a href="#cb8-139" aria-hidden="true" tabindex="-1"></a>$$\hat{y} = \theta_{1}\phi_{1} + \theta_{2}\phi_{2} + \theta_{3}\phi_{3} + \theta_{4}\phi_{4} + \theta_{5}\phi_{5} + \theta_{6}\phi_{6}$$</span>
-<span id="cb8-140"><a href="#cb8-140" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-141"><a href="#cb8-141" aria-hidden="true" tabindex="-1"></a>Now, the <span class="in">`day`</span> feature (or rather, the four new boolean features that represent day) can be used to fit a model.</span>
-<span id="cb8-142"><a href="#cb8-142" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-143"><a href="#cb8-143" aria-hidden="true" tabindex="-1"></a>Using <span class="in">`sklearn`</span> to fit the new model, we can determine the model coefficients, allowing us to understand how each feature impacts the predicted tip.</span>
-<span id="cb8-144"><a href="#cb8-144" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-147"><a href="#cb8-147" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb8-148"><a href="#cb8-148" aria-hidden="true" tabindex="-1"></a><span class="co">#| vscode: {languageId: python}</span></span>
-<span id="cb8-149"><a href="#cb8-149" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> sklearn.linear_model <span class="im">import</span> LinearRegression</span>
-<span id="cb8-150"><a href="#cb8-150" aria-hidden="true" tabindex="-1"></a>data_w_ohe <span class="op">=</span> tips[[<span class="st">"total_bill"</span>, <span class="st">"size"</span>, <span class="st">"day"</span>]].join(encoded_day_df).drop(columns <span class="op">=</span> <span class="st">"day"</span>)</span>
-<span id="cb8-151"><a href="#cb8-151" aria-hidden="true" tabindex="-1"></a>ohe_model <span class="op">=</span> lm.LinearRegression(fit_intercept<span class="op">=</span><span class="va">False</span>) <span class="co">#Tell sklearn to not add an additional bias column. Why?</span></span>
-<span id="cb8-152"><a href="#cb8-152" aria-hidden="true" tabindex="-1"></a>ohe_model.fit(data_w_ohe, tips[<span class="st">"tip"</span>])</span>
-<span id="cb8-153"><a href="#cb8-153" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-154"><a href="#cb8-154" aria-hidden="true" tabindex="-1"></a>pd.DataFrame({<span class="st">"Feature"</span>:data_w_ohe.columns, <span class="st">"Model Coefficient"</span>:ohe_model.coef_})</span>
-<span id="cb8-155"><a href="#cb8-155" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb8-156"><a href="#cb8-156" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-157"><a href="#cb8-157" aria-hidden="true" tabindex="-1"></a>For example, when looking at the coefficient for <span class="in">`day_Fri`</span>, we can understand how much the fact that it is Friday impacts the predicted tip. </span>
-<span id="cb8-158"><a href="#cb8-158" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-159"><a href="#cb8-159" aria-hidden="true" tabindex="-1"></a>When one-hot encoding, keep in mind that any set of one-hot encoded columns will always sum to a column of all ones, representing the bias column. More formally, the bias column is a linear combination of the OHE columns.</span>
-<span id="cb8-160"><a href="#cb8-160" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-161"><a href="#cb8-161" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/bias.png" alt='bias' width='600'&gt;&lt;/center&gt;</span>
-<span id="cb8-162"><a href="#cb8-162" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-163"><a href="#cb8-163" aria-hidden="true" tabindex="-1"></a>We must be careful not to include this bias column in our design matrix. Otherwise, there will be linear dependence in the model, meaning $\mathbb{X}^{\top}\mathbb{X}$ would no longer be invertible, and our OLS estimate $\hat{\theta} = (\mathbb{X}^{\top}\mathbb{X})^{-1}\mathbb{X}^{\top}\mathbb{Y}$ fails.</span>
-<span id="cb8-164"><a href="#cb8-164" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-165"><a href="#cb8-165" aria-hidden="true" tabindex="-1"></a>To resolve this issue, we simply omit one of the one-hot encoded columns *or* do not include an intercept term. The adjusted design matrices are shown below.</span>
-<span id="cb8-166"><a href="#cb8-166" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-167"><a href="#cb8-167" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/remove.png" alt='remove' width='600'&gt;&lt;/center&gt;</span>
-<span id="cb8-168"><a href="#cb8-168" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-169"><a href="#cb8-169" aria-hidden="true" tabindex="-1"></a>Either approach works — we still retain the same information, because the omitted column is a linear combination of the remaining columns.</span>
-<span id="cb8-170"><a href="#cb8-170" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-171"><a href="#cb8-171" aria-hidden="true" tabindex="-1"></a><span class="fu">## Polynomial Features</span></span>
-<span id="cb8-172"><a href="#cb8-172" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-173"><a href="#cb8-173" aria-hidden="true" tabindex="-1"></a>We have encountered a few cases now where models with linear features have performed poorly on datasets that show clear non-linear curvature. </span>
-<span id="cb8-174"><a href="#cb8-174" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-175"><a href="#cb8-175" aria-hidden="true" tabindex="-1"></a>As an example, consider the <span class="in">`vehicles`</span> dataset, which contains information about cars. Suppose we want to use the <span class="in">`hp`</span> (horsepower) of a car to predict its <span class="in">`"mpg"`</span> (gas mileage in miles per gallon). If we visualize the relationship between these two variables, we see a non-linear curvature. Fitting a linear model to these variables results in a high (poor) value of MSE. </span>
-<span id="cb8-176"><a href="#cb8-176" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-177"><a href="#cb8-177" aria-hidden="true" tabindex="-1"></a>$$\hat{y} = \theta_0 + \theta_1 (\text{hp})$$</span>
-<span id="cb8-178"><a href="#cb8-178" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-181"><a href="#cb8-181" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb8-182"><a href="#cb8-182" aria-hidden="true" tabindex="-1"></a><span class="co">#| code-fold: true</span></span>
-<span id="cb8-183"><a href="#cb8-183" aria-hidden="true" tabindex="-1"></a><span class="co">#| vscode: {languageId: python}</span></span>
-<span id="cb8-184"><a href="#cb8-184" aria-hidden="true" tabindex="-1"></a>pd.options.mode.chained_assignment <span class="op">=</span> <span class="va">None</span> </span>
-<span id="cb8-185"><a href="#cb8-185" aria-hidden="true" tabindex="-1"></a>vehicles <span class="op">=</span> sns.load_dataset(<span class="st">"mpg"</span>).dropna().rename(columns <span class="op">=</span> {<span class="st">"horsepower"</span>: <span class="st">"hp"</span>}).sort_values(<span class="st">"hp"</span>)</span>
-<span id="cb8-186"><a href="#cb8-186" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-187"><a href="#cb8-187" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> vehicles[[<span class="st">"hp"</span>]]</span>
-<span id="cb8-188"><a href="#cb8-188" aria-hidden="true" tabindex="-1"></a>Y <span class="op">=</span> vehicles[<span class="st">"mpg"</span>]</span>
-<span id="cb8-189"><a href="#cb8-189" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-190"><a href="#cb8-190" aria-hidden="true" tabindex="-1"></a>hp_model <span class="op">=</span> lm.LinearRegression()</span>
-<span id="cb8-191"><a href="#cb8-191" aria-hidden="true" tabindex="-1"></a>hp_model.fit(X, Y)</span>
-<span id="cb8-192"><a href="#cb8-192" aria-hidden="true" tabindex="-1"></a>hp_model_predictions <span class="op">=</span> hp_model.predict(X)</span>
-<span id="cb8-193"><a href="#cb8-193" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-194"><a href="#cb8-194" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> matplotlib.pyplot <span class="im">as</span> plt</span>
-<span id="cb8-195"><a href="#cb8-195" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-196"><a href="#cb8-196" aria-hidden="true" tabindex="-1"></a>sns.scatterplot(data<span class="op">=</span>vehicles, x<span class="op">=</span><span class="st">"hp"</span>, y<span class="op">=</span><span class="st">"mpg"</span>)</span>
-<span id="cb8-197"><a href="#cb8-197" aria-hidden="true" tabindex="-1"></a>plt.plot(vehicles[<span class="st">"hp"</span>], hp_model_predictions, c<span class="op">=</span><span class="st">"tab:red"</span>)<span class="op">;</span></span>
-<span id="cb8-198"><a href="#cb8-198" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-199"><a href="#cb8-199" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f"MSE of model with (hp) feature: </span><span class="sc">{</span>np<span class="sc">.</span>mean((Y<span class="op">-</span>hp_model_predictions)<span class="op">**</span><span class="dv">2</span>)<span class="sc">}</span><span class="ss">"</span>)</span>
-<span id="cb8-200"><a href="#cb8-200" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb8-201"><a href="#cb8-201" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-202"><a href="#cb8-202" aria-hidden="true" tabindex="-1"></a>To capture non-linearity in a dataset, it makes sense to incorporate **non-linear** features. Let's introduce a **polynomial** term, $\text{hp}^2$, into our regression model. The model now takes the form:</span>
-<span id="cb8-203"><a href="#cb8-203" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-204"><a href="#cb8-204" aria-hidden="true" tabindex="-1"></a>$$\hat{y} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2)$$</span>
-<span id="cb8-205"><a href="#cb8-205" aria-hidden="true" tabindex="-1"></a>$$\hat{y} = \theta_0 + \theta_1 \phi_1 + \theta_2 \phi_2$$</span>
-<span id="cb8-206"><a href="#cb8-206" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-207"><a href="#cb8-207" aria-hidden="true" tabindex="-1"></a>How can we fit a model with non-linear features? We can use the exact same techniques as before: ordinary least squares, gradient descent, or <span class="in">`sklearn`</span>. This is because our new model is still a **linear model**. Although it contains non-linear *features*, it is linear with respect to the model *parameters*. All of our previous work on fitting models was done under the assumption that we were working with linear models. Because our new model is still linear, we can apply our existing methods to determine the optimal parameters. </span>
-<span id="cb8-208"><a href="#cb8-208" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-211"><a href="#cb8-211" aria-hidden="true" tabindex="-1"></a><span class="in">```{python}</span></span>
-<span id="cb8-212"><a href="#cb8-212" aria-hidden="true" tabindex="-1"></a><span class="co">#| vscode: {languageId: python}</span></span>
-<span id="cb8-213"><a href="#cb8-213" aria-hidden="true" tabindex="-1"></a><span class="co"># Add a hp^2 feature to the design matrix</span></span>
-<span id="cb8-214"><a href="#cb8-214" aria-hidden="true" tabindex="-1"></a>X <span class="op">=</span> vehicles[[<span class="st">"hp"</span>]]</span>
-<span id="cb8-215"><a href="#cb8-215" aria-hidden="true" tabindex="-1"></a>X[<span class="st">"hp^2"</span>] <span class="op">=</span> vehicles[<span class="st">"hp"</span>]<span class="op">**</span><span class="dv">2</span></span>
-<span id="cb8-216"><a href="#cb8-216" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-217"><a href="#cb8-217" aria-hidden="true" tabindex="-1"></a><span class="co"># Use sklearn to fit the model</span></span>
-<span id="cb8-218"><a href="#cb8-218" aria-hidden="true" tabindex="-1"></a>hp2_model <span class="op">=</span> lm.LinearRegression()</span>
-<span id="cb8-219"><a href="#cb8-219" aria-hidden="true" tabindex="-1"></a>hp2_model.fit(X, Y)</span>
-<span id="cb8-220"><a href="#cb8-220" aria-hidden="true" tabindex="-1"></a>hp2_model_predictions <span class="op">=</span> hp2_model.predict(X)</span>
-<span id="cb8-221"><a href="#cb8-221" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-222"><a href="#cb8-222" aria-hidden="true" tabindex="-1"></a>sns.scatterplot(data<span class="op">=</span>vehicles, x<span class="op">=</span><span class="st">"hp"</span>, y<span class="op">=</span><span class="st">"mpg"</span>)</span>
-<span id="cb8-223"><a href="#cb8-223" aria-hidden="true" tabindex="-1"></a>plt.plot(vehicles[<span class="st">"hp"</span>], hp2_model_predictions, c<span class="op">=</span><span class="st">"tab:red"</span>)<span class="op">;</span></span>
-<span id="cb8-224"><a href="#cb8-224" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-225"><a href="#cb8-225" aria-hidden="true" tabindex="-1"></a><span class="bu">print</span>(<span class="ss">f"MSE of model with (hp^2) feature: </span><span class="sc">{</span>np<span class="sc">.</span>mean((Y<span class="op">-</span>hp2_model_predictions)<span class="op">**</span><span class="dv">2</span>)<span class="sc">}</span><span class="ss">"</span>)</span>
-<span id="cb8-226"><a href="#cb8-226" aria-hidden="true" tabindex="-1"></a><span class="in">```</span></span>
-<span id="cb8-227"><a href="#cb8-227" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-228"><a href="#cb8-228" aria-hidden="true" tabindex="-1"></a>Looking a lot better! By incorporating a squared feature, we are able to capture the curvature of the dataset. Our model is now a parabola centered on our data. Notice that our new model's error has decreased relative to the original model with linear features.</span>
-<span id="cb8-229"><a href="#cb8-229" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-230"><a href="#cb8-230" aria-hidden="true" tabindex="-1"></a><span class="fu">## Complexity and Overfitting</span></span>
-<span id="cb8-231"><a href="#cb8-231" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-232"><a href="#cb8-232" aria-hidden="true" tabindex="-1"></a>We've seen now that feature engineering allows us to build all sorts of features to improve the performance of the model. In particular, we saw that designing a more complex feature (squaring <span class="in">`hp`</span> in the <span class="in">`vehicles`</span> data previously) substantially improved the model's ability to capture non-linear relationships. To take full advantage of this, we might be inclined to design increasingly complex features. Consider the following three models, each of different order (the maximum exponent power of each model):</span>
-<span id="cb8-233"><a href="#cb8-233" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-234"><a href="#cb8-234" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Model with order 2: $\hat{\text{mpg}} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2)$</span>
-<span id="cb8-235"><a href="#cb8-235" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Model with order 3: $\hat{\text{mpg}} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2) + \theta_3 (\text{hp}^3)$</span>
-<span id="cb8-236"><a href="#cb8-236" aria-hidden="true" tabindex="-1"></a><span class="ss">* </span>Model with order 4: $\hat{\text{mpg}} = \theta_0 + \theta_1 (\text{hp}) + \theta_2 (\text{hp}^2) + \theta_3 (\text{hp}^3) + \theta_4 (\text{hp}^4)$</span>
-<span id="cb8-237"><a href="#cb8-237" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-238"><a href="#cb8-238" aria-hidden="true" tabindex="-1"></a>&lt;br/&gt;</span>
-<span id="cb8-239"><a href="#cb8-239" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-240"><a href="#cb8-240" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/degree_comparison.png" alt='degree_comparison' width='900'&gt;&lt;/center&gt;</span>
-<span id="cb8-241"><a href="#cb8-241" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-242"><a href="#cb8-242" aria-hidden="true" tabindex="-1"></a>As we can see in the plots above, MSE continues to decrease with each additional polynomial term. To visualize it further, let's plot models as the complexity increases from 0 to 6: </span>
-<span id="cb8-243"><a href="#cb8-243" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-244"><a href="#cb8-244" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/degree_comparison2.png" alt='degree_comparison' width='900'&gt;&lt;/center&gt;</span>
-<span id="cb8-245"><a href="#cb8-245" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-246"><a href="#cb8-246" aria-hidden="true" tabindex="-1"></a>When we use our model to make predictions on the same data that was used to fit the model, we find that the MSE decreases with each additional polynomial term (as our model gets more complex). The **training error** is the model's error when generating predictions from the same data that was used for training purposes. We can conclude that the training error goes down as the complexity of the model increases. </span>
-<span id="cb8-247"><a href="#cb8-247" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-248"><a href="#cb8-248" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/train_error.png" alt='train_error' width='400'&gt;&lt;/center&gt;</span>
-<span id="cb8-249"><a href="#cb8-249" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-250"><a href="#cb8-250" aria-hidden="true" tabindex="-1"></a>This seems like good news – when working on the **training data**, we can improve model performance by designing increasingly complex models. </span>
-<span id="cb8-251"><a href="#cb8-251" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-252"><a href="#cb8-252" aria-hidden="true" tabindex="-1"></a><span class="at">&gt;**Math Fact**: given $N$ non-overlapping data points (no two sharing the same $x$-value), we can always find a polynomial of degree at most $N-1$ that goes through all those points.</span></span>
-<span id="cb8-253"><a href="#cb8-253" aria-hidden="true" tabindex="-1"></a><span class="at">&gt; </span></span>
-<span id="cb8-254"><a href="#cb8-254" aria-hidden="true" tabindex="-1"></a><span class="at">&gt; For example: there always exists a degree-4 polynomial curve that can perfectly model a dataset of 5 datapoints</span></span>
-<span id="cb8-255"><a href="#cb8-255" aria-hidden="true" tabindex="-1"></a><span class="at">&gt; </span>&lt;center&gt;&lt;img src="images/perfect_poly_fits.png" alt='train_error' width='600'&gt;&lt;/center&gt;</span>
-<span id="cb8-256"><a href="#cb8-256" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-257"><a href="#cb8-257" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-258"><a href="#cb8-258" aria-hidden="true" tabindex="-1"></a>However, high model complexity comes with its own set of issues. When building the <span class="in">`vehicles`</span> models above, we trained the models on the *entire* dataset and then evaluated their performance on this same dataset. In reality, we are likely to instead train the model on a *sample* from the population, then use it to make predictions on data it didn't encounter during training. </span>
-<span id="cb8-259"><a href="#cb8-259" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-260"><a href="#cb8-260" aria-hidden="true" tabindex="-1"></a>Let's walk through a more realistic example. Say we are given a training dataset of just 6 datapoints and want to train a model to then make predictions on a *different* set of points. We may be tempted to make a highly complex model (e.g., degree 5), especially given it makes perfect predictions on the training data as clear on the left. However, as shown in the graph on the right, this model would perform *horribly* on the rest of the population! </span>
-<span id="cb8-261"><a href="#cb8-261" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-262"><a href="#cb8-262" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/complex.png" alt='complex' width='600'&gt;&lt;/center&gt;</span>
-<span id="cb8-263"><a href="#cb8-263" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-264"><a href="#cb8-264" aria-hidden="true" tabindex="-1"></a>The phenomenon above is called **overfitting**. The model effectively just memorized the training data it encountered when it was fitted, leaving it unable to **generalize** well to data it didn't encounter during training. This is a problem: we want models that are generalizable to “unseen” data.</span>
-<span id="cb8-265"><a href="#cb8-265" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-266"><a href="#cb8-266" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-267"><a href="#cb8-267" aria-hidden="true" tabindex="-1"></a>Additionally, since complex models are sensitive to the specific dataset used to train them, they have high **variance**. A model with high variance tends to *vary* more dramatically when trained on different datasets. Going back to our example above, we can see our degree-5 model varies erratically when we fit it to different samples of 6 points from <span class="in">`vehicles`</span>. </span>
-<span id="cb8-268"><a href="#cb8-268" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-269"><a href="#cb8-269" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/resamples.png" alt='resamples' width='800'&gt;&lt;/center&gt;</span>
-<span id="cb8-270"><a href="#cb8-270" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-271"><a href="#cb8-271" aria-hidden="true" tabindex="-1"></a>We now face a dilemma: we know that we can **decrease training error** by increasing model complexity, but models that are *too* complex start to overfit and can't be reapplied to new datasets due to **high variance**.</span>
-<span id="cb8-272"><a href="#cb8-272" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-273"><a href="#cb8-273" aria-hidden="true" tabindex="-1"></a>&lt;center&gt;&lt;img src="images/bvt.png" alt='bvt' width='400'&gt;&lt;/center&gt;</span>
-<span id="cb8-274"><a href="#cb8-274" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-275"><a href="#cb8-275" aria-hidden="true" tabindex="-1"></a>We can see that there is a clear trade-off that comes from the complexity of our model. As model complexity increases, the model's error on the training data decreases. At the same time, the model's variance tends to increase.</span>
-<span id="cb8-276"><a href="#cb8-276" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-277"><a href="#cb8-277" aria-hidden="true" tabindex="-1"></a>The takeaway here: we need to strike a balance in the complexity of our models; we want models that are generalizable to "unseen" data. A model that is too simple won't be able to capture the key relationships between our variables of interest; a model that is too complex runs the risk of overfitting. </span>
-<span id="cb8-278"><a href="#cb8-278" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb8-279"><a href="#cb8-279" aria-hidden="true" tabindex="-1"></a>This begs the question: how do we control the complexity of a model? Stay tuned for our Lecture 17 on Cross-Validation and Regularization!</span>
-</code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
-</div></div></div></div></div>
-</div> <!-- /content -->
-
-
-
-
-</body></html>
\ No newline at end of file
diff --git a/gradient_descent/gradient_descent.ipynb b/gradient_descent/gradient_descent.ipynb
new file mode 100644
index 00000000..64106e4e
--- /dev/null
+++ b/gradient_descent/gradient_descent.ipynb
@@ -0,0 +1,965 @@
+{
+  "cells": [
+    {
+      "cell_type": "raw",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "title: sklearn and Gradient Descent\n",
+        "execute:\n",
+        "  echo: true\n",
+        "  warning: false\n",
+        "format:\n",
+        "  html:\n",
+        "    code-fold: false\n",
+        "    code-tools: true\n",
+        "    toc: true\n",
+        "    toc-title: sklearn and Gradient Descent\n",
+        "    page-layout: full\n",
+        "    theme:\n",
+        "      - cosmo\n",
+        "      - cerulean\n",
+        "    callout-icon: false\n",
+        "---"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "::: {.callout-note collapse=\"false\"}\n",
+        "## Learning Outcomes\n",
+        "* Apply the `sklearn` library for model creation and training\n",
+        "* Optimize complex models \n",
+        "* Identify cases where straight calculus or geometric arguments won't help minimize the loss function\n",
+        "* Apply gradient descent for numerical optimization\n",
+        ":::"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "import pandas as pd\n",
+        "import seaborn as sns\n",
+        "import plotly.express as px\n",
+        "import matplotlib.pyplot as plt\n",
+        "import numpy as np\n",
+        "from sklearn.linear_model import LinearRegression\n",
+        "pd.options.mode.chained_assignment = None  # default='warn'"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## `sklearn`\n",
+        "### Implementing Derived Formulas in Code\n",
+        "\n",
+        "Throughout this lecture, we'll refer to the `penguins` dataset. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "import pandas as pd\n",
+        "import seaborn as sns\n",
+        "import numpy as np\n",
+        "\n",
+        "penguins = sns.load_dataset(\"penguins\")\n",
+        "penguins = penguins[penguins[\"species\"] == \"Adelie\"].dropna()\n",
+        "penguins.head()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Our goal will be to predict the value of the `\"bill_depth_mm\"` for a particular penguin given its `\"flipper_length_mm\"` and `\"body_mass_g\"`. We'll also add a bias column of all ones to represent the intercept term of our models."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# Add a bias column of all ones to `penguins`\n",
+        "penguins[\"bias\"] = np.ones(len(penguins), dtype=int) \n",
+        "\n",
+        "# Define the design matrix, X...\n",
+        "# Note that we use .to_numpy() to convert our DataFrame into a NumPy array so it's in Matrix form\n",
+        "X = penguins[[\"bias\", \"flipper_length_mm\", \"body_mass_g\"]].to_numpy()\n",
+        "\n",
+        "# ...as well as the target variable, Y\n",
+        "# Again, we use .to_numpy() to convert our DataFrame into a NumPy array so it's in Matrix form\n",
+        "Y = penguins[[\"bill_depth_mm\"]].to_numpy()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "In the lecture on ordinary least squares, we expressed multiple linear regression using matrix notation.\n",
+        "\n",
+        "$$\\hat{\\mathbb{Y}} = \\mathbb{X}\\theta$$\n",
+        "\n",
+        "We used a geometric approach to derive the following expression for the optimal model parameters:\n",
+        "\n",
+        "$$\\hat{\\theta} = (\\mathbb{X}^T \\mathbb{X})^{-1}\\mathbb{X}^T \\mathbb{Y}$$\n",
+        "\n",
+        "That's a whole lot of matrix manipulation. How do we implement it in `python`?\n",
+        "\n",
+        "There are three operations we need to perform here: multiplying matrices, taking transposes, and finding inverses. \n",
+        "\n",
+        "* To perform matrix multiplication, use the `@` operator\n",
+        "* To take a transpose, call the `.T` attribute of a `NumPy` array or `DataFrame`\n",
+        "* To compute an inverse, use `NumPy`'s in-built method `np.linalg.inv`\n",
+        "\n",
+        "Putting this all together, we can compute the OLS estimate for the optimal model parameters, stored in the array `theta_hat`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: false\n",
+        "theta_hat = np.linalg.inv(X.T @ X) @ X.T @ Y\n",
+        "theta_hat"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "To make predictions using our optimized parameter values, we matrix-multiply the design matrix with the parameter vector:\n",
+        "\n",
+        "$$\\hat{\\mathbb{Y}} = \\mathbb{X}\\hat{\\theta}$$"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: false\n",
+        "Y_hat = X @ theta_hat\n",
+        "pd.DataFrame(Y_hat).head()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### The `sklearn` Workflow\n",
+        "We've already saved a lot of time (and avoided tedious calculations) by translating our derived formulas into code. However, we still had to go through the process of writing out the linear algebra ourselves. \n",
+        "\n",
+        "To make life *even easier*, we can turn to the `sklearn` [`python` library](https://scikit-learn.org/stable/). `sklearn` is a robust library of machine learning tools used extensively in research and industry. It is the standard for simple machine learning tasks and gives us a wide variety of in-built modeling frameworks and methods, so we'll keep returning to `sklearn` techniques as we progress through Data 100. \n",
+        "\n",
+        "Regardless of the specific type of model being implemented, `sklearn` follows a standard set of steps for creating a model: \n",
+        "\n",
+        "1. Import the `LinearRegression` model from `sklearn`\n",
+        "\n",
+        "    ```\n",
+        "    from sklearn.linear_model import LinearRegression\n",
+        "    ```\n",
+        "\n",
+        "2. Create a model object. This generates a new instance of the model class. You can think of it as making a new \"copy\" of a standard \"template\" for a model. In code, this looks like:\n",
+        "\n",
+        "    ```\n",
+        "    my_model = LinearRegression()\n",
+        "    ```\n",
+        "\n",
+        "    \n",
+        "3. Fit the model to the `X` design matrix and `Y` target vector. This calculates the optimal model parameters \"behind the scenes\" without us explicitly working through the calculations ourselves. The fitted parameters are then stored within the model for use in future predictions:\n",
+        "\n",
+        "    ```\n",
+        "    my_model.fit(X, Y)\n",
+        "     ```\n",
+        "\n",
+        "    \n",
+        "4. Use the fitted model to make predictions on the `X` input data using `.predict`. \n",
+        "\n",
+        "    ```\n",
+        "    my_model.predict(X)\n",
+        "    ```\n",
+        "\n",
+        "To extract the fitted parameters, we can use:\n",
+        "\n",
+        "  ```\n",
+        "  my_model.coef_\n",
+        "\n",
+        "  my_model.intercept_\n",
+        "  ```\n",
+        "\n",
+        "\n",
+        "Let's put this into action with our multiple regression task!\n",
+        "\n",
+        "**1. Initialize an instance of the model class**\n",
+        "\n",
+        "`sklearn` stores \"templates\" of useful models for machine learning. We begin the modeling process by making a \"copy\" of one of these templates for our own use. Model initialization looks like `ModelClass()`, where `ModelClass` is the type of model we wish to create.\n",
+        "\n",
+        "For now, let's create a linear regression model using `LinearRegression`. \n",
+        "\n",
+        "`my_model` is now an instance of the `LinearRegression` class. You can think of it as the \"idea\" of a linear regression model. We haven't trained it yet, so it doesn't know any model parameters and cannot be used to make predictions. In fact, we haven't even told it what data to use for modeling! It simply waits for further instructions."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "my_model = LinearRegression()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "**2. Train the model using `.fit`**\n",
+        "\n",
+        "Before the model can make predictions, we will need to fit it to our training data. When we fit the model, `sklearn` will determine the optimal model parameters behind the scenes (for `LinearRegression`, by solving the least squares problem directly rather than by running gradient descent). It will then save these model parameters to our model instance for future use. \n",
+        "\n",
+        "All `sklearn` model classes include a `.fit` method, which is used to fit the model. It takes in two inputs: the design matrix, `X`, and the target variable, `Y`. \n",
+        "\n",
+        "Let's start by fitting a model with just one feature: the flipper length. We create a design matrix `X` by pulling out the `\"flipper_length_mm\"` column from the `DataFrame`. "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# .fit expects a 2D data design matrix, so we use double brackets to extract a DataFrame\n",
+        "X = penguins[[\"flipper_length_mm\"]]\n",
+        "Y = penguins[\"bill_depth_mm\"]\n",
+        "\n",
+        "my_model.fit(X, Y)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "Notice that we use **double brackets** to extract this column. Why double brackets instead of just single brackets? The `.fit` method, by default, expects to receive **2-dimensional** data – some kind of data that includes both rows and columns. Writing `penguins[\"flipper_length_mm\"]` would return a 1D `Series`, causing `sklearn` to error. We avoid this by writing `penguins[[\"flipper_length_mm\"]]` to produce a 2D `DataFrame`. \n",
+        "\n",
+        "And in just three lines of code, our model has determined the optimal model parameters! Our single-feature model takes the form:\n",
+        "\n",
+        "$$\\text{bill depth} = \\theta_0 + \\theta_1 \\text{flipper length}$$\n",
+        "\n",
+        "Note that `LinearRegression` will automatically include an intercept term. \n",
+        "\n",
+        "The fitted model parameters are stored as attributes of the model instance. `my_model.intercept_` will return the value of $\\hat{\\theta}_0$ as a scalar. `my_model.coef_` will return all values $\\hat{\\theta}_1, \n",
+        "\\hat{\\theta}_2, ...$ in an array. Because our model only contains one feature, we see just the value of $\\hat{\\theta}_1$ in the cell below."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# The intercept term, theta_0\n",
+        "my_model.intercept_"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# All parameters theta_1, ..., theta_p\n",
+        "my_model.coef_"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "**3. Use the fitted model to make predictions**\n",
+        "\n",
+        "Now that the model has been trained, we can use it to make predictions! To do so, we use the `.predict` method. `.predict` takes in one argument: the design matrix that should be used to generate predictions. To understand how the model performs on the training set, we would pass in the training data. Alternatively, to make predictions on unseen data, we would pass in a new dataset that wasn't used to train the model.\n",
+        "\n",
+        "Below, we call `.predict` to generate model predictions on the original training data. As before, we use double brackets to ensure that we extract 2-dimensional data."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "Y_hat_one_feature = my_model.predict(penguins[[\"flipper_length_mm\"]])\n",
+        "\n",
+        "print(f\"The RMSE of the model is {np.sqrt(np.mean((Y-Y_hat_one_feature)**2))}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "What if we wanted a model with two features? \n",
+        "\n",
+        "$$\\text{bill depth} = \\theta_0 + \\theta_1 \\text{flipper length} + \\theta_2 \\text{body mass}$$\n",
+        "\n",
+        "We repeat this three-step process by initializing a new model object, then calling `.fit` and `.predict` as before."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "# Step 1: initialize LinearRegression model\n",
+        "two_feature_model = LinearRegression()\n",
+        "\n",
+        "# Step 2: fit the model\n",
+        "X_two_features = penguins[[\"flipper_length_mm\", \"body_mass_g\"]]\n",
+        "Y = penguins[\"bill_depth_mm\"]\n",
+        "\n",
+        "two_feature_model.fit(X_two_features, Y)\n",
+        "\n",
+        "# Step 3: make predictions\n",
+        "Y_hat_two_features = two_feature_model.predict(X_two_features)\n",
+        "\n",
+        "print(f\"The RMSE of the model is {np.sqrt(np.mean((Y-Y_hat_two_features)**2))}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We can also see that we obtain the same predictions using `sklearn` as we did when applying the ordinary least squares formula before! "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "pd.DataFrame({\"Y_hat from OLS\":np.squeeze(Y_hat), \"Y_hat from sklearn\":Y_hat_two_features}).head()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Gradient Descent \n",
+        "\n",
+        "At this point, we've grown quite familiar with the process of choosing a model and a corresponding loss function and optimizing parameters by choosing the values of $\\theta$ that minimize the loss function. So far, we've optimized $\\theta$ by\n",
+        "\n",
+        "1. Using calculus to take the derivative of the loss function with respect to $\\theta$, setting it equal to 0, and solving for $\\theta$.\n",
+        "2. Using the geometric argument of orthogonality to derive the OLS solution $\\hat{\\theta} = (\\mathbb{X}^T \\mathbb{X})^{-1}\\mathbb{X}^T \\mathbb{Y}$.\n",
+        "\n",
+        "One thing to note, however, is that the techniques we used above can only be applied if we make some big assumptions. For the calculus approach, we assumed that the loss function was differentiable at all points and that we could algebraically solve for the zero points of the derivative; for the geometric approach, OLS *only* applies when using a linear model with MSE loss. What happens when we have more complex models with different, more complex loss functions? The techniques we've learned so far will not work, so we need a new optimization technique: **gradient descent**. \n",
+        "\n",
+        "> **BIG IDEA**: use an iterative algorithm to numerically compute the minimum of the loss.\n",
+        "\n",
+        "### Minimizing an Arbitrary 1D Function\n",
+        "\n",
+        "Let's consider an arbitrary function. Our goal is to find the value of $x$ that minimizes this function."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "def arbitrary(x):\n",
+        "    return (x**4 - 15*x**3 + 80*x**2 - 180*x + 144)/10"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "<img src=\"images/arbitrary.png\" alt='arbitrary' width='600'>\n",
+        "\n",
+        "#### The Naive Approach: Guess and Check\n",
+        "\n",
+        "Above, we saw that the minimum is somewhere around 5.3. Let's see if we can figure out how to find the exact minimum algorithmically from scratch. One very slow (and terrible) way would be manual guess-and-check."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "arbitrary(6)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "A somewhat better (but still slow) approach is to use brute force to try out a bunch of x values and return the one that yields the lowest loss."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "def simple_minimize(f, xs):\n",
+        "    # Takes in a function f and a set of values xs. \n",
+        "    # Calculates the value of the function f at all values x in xs\n",
+        "    # Takes the minimum value of f(x) and returns the corresponding value x \n",
+        "    y = [f(x) for x in xs]  \n",
+        "    return xs[np.argmin(y)]\n",
+        "\n",
+        "guesses = [5.3, 5.31, 5.32, 5.33, 5.34, 5.35]\n",
+        "simple_minimize(arbitrary, guesses)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "This process is essentially the same as before, where we made a graphical plot; it's just that we're only looking at a handful of selected points."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "xs = np.linspace(1, 7, 200)\n",
+        "sparse_xs = np.linspace(1, 7, 5)\n",
+        "\n",
+        "ys = arbitrary(xs)\n",
+        "sparse_ys = arbitrary(sparse_xs)\n",
+        "\n",
+        "fig = px.line(x = xs, y = arbitrary(xs))\n",
+        "fig.add_scatter(x = sparse_xs, y = arbitrary(sparse_xs), mode = \"markers\")\n",
+        "fig.update_layout(showlegend= False)\n",
+        "fig.update_layout(autosize=False, width=800, height=600)\n",
+        "fig.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "This basic approach suffers from three major flaws:\n",
+        "\n",
+        "1. If the minimum is outside our range of guesses, the answer will be completely wrong.\n",
+        "2. Even if our range of guesses is correct, if the guesses are too coarse, our answer will be inaccurate.\n",
+        "3. It is *very* computationally inefficient, considering potentially vast numbers of guesses that are useless.\n",
+        "\n",
+        "#### Scipy.optimize.minimize\n",
+        "\n",
+        "One way to minimize this mathematical function is to use the `scipy.optimize.minimize` function. It takes a function and a starting guess and tries to find the minimum."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "from scipy.optimize import minimize\n",
+        "\n",
+        "# takes a function f and a starting point x0 and returns a readout \n",
+        "# with the optimal input value of x which minimizes f\n",
+        "minimize(arbitrary, x0 = 3.5)"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "`scipy.optimize.minimize` is great. It may also seem a bit magical. How could you write a function that can find the minimum of any mathematical function? There are a number of ways to do this, which we'll explore in today's lecture, eventually arriving at the important idea of **gradient descent**, which is the principle that `scipy.optimize.minimize` uses.\n",
+        "\n",
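+        "The `fit` method for `LinearRegression` does not need gradient descent, because ordinary least squares can be solved directly. Most models are not so convenient: when no closed-form solution exists, fitting relies on numerical methods like gradient descent. Gradient descent is also how much of machine learning works, including even advanced neural network models. \n",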
+        "\n",
+        "In Data 100, the gradient descent process will usually be invisible to us, hidden beneath an abstraction layer. However, to be good data scientists, it's important that we know the underlying principles that optimization functions harness to find optimal parameters.\n",
+        "\n",
+        "\n",
+        "#### Digging into Gradient Descent\n",
+        "Looking at the function across this domain, it is clear that the function's minimum value occurs around $\\theta = 5.3$. Let's pretend for a moment that we *couldn't* see the full view of the cost function. How would we guess the value of $\\theta$ that minimizes the function? \n",
+        "\n",
+        "It turns out that the first derivative of the function can give us a clue. In the plots below, the line indicates the value of the derivative at each value of $\\theta$. The derivative is negative where it is red and positive where it is green.\n",
+        "\n",
+        "\n",
+        "Say we make a guess for the minimizing value of $\\theta$. Remember that we read plots from left to right, and assume that our starting $\\theta$ value is to the left of the optimal $\\hat{\\theta}$. If the guess \"undershoots\" the true minimizing value – our guess for $\\theta$ is lower than the value of the $\\hat{\\theta}$ that minimizes the function – the derivative will be **negative**. This means that if we increase $\\theta$ (move further to the right), then we **can decrease** our loss function further. If this guess \"overshoots\" the true minimizing value, the derivative will be positive, implying the converse.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/step.png\" alt='step' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "We can use this pattern to help formulate our next guess for the optimal $\\hat{\\theta}$. Consider the case where we've undershot $\\theta$ by guessing too low of a value. We'll want our next guess to be greater in value than our previous guess – that is, we want to shift our guess to the right. You can think of this as following the slope \"downhill\" to the function's minimum value.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/neg_step.png\" alt='neg_step' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "If we've overshot $\\hat{\\theta}$ by guessing too high of a value, we'll want our next guess to be lower in value – we want to shift our guess for $\\hat{\\theta}$ to the left. \n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/pos_step.png\" alt='pos_step' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "In other words, the derivative of the function at each point tells us the direction of our next guess.\n",
+        "\n",
+        "* A negative slope means we want to step to the right, or move in the *positive* direction. \n",
+        "* A positive slope means we want to step to the left, or move in the *negative* direction.\n",
+        "\n",
+        "#### Algorithm Attempt 1\n",
+        "Armed with this knowledge, let's try to see if we can use the derivative to optimize the function.\n",
+        "\n",
+        "We start by making some guess for the minimizing value of $x$. Then, we look at the derivative of the function at this value of $x$, and step downhill in the *opposite* direction. We can express our new rule as a recurrence relation:\n",
+        "\n",
+        "$$x^{(t+1)} = x^{(t)} - \\frac{d}{dx} f(x^{(t)})$$\n",
+        "\n",
+        "Translating this statement into English: we obtain **our next guess** for the minimizing value of $x$ at timestep $t+1$ ($x^{(t+1)}$) by taking **our last guess** ($x^{(t)}$) and subtracting the **derivative of the function** at that point ($\\frac{d}{dx} f(x^{(t)})$).\n",
+        "\n",
+        "A few steps are shown below, where the old step is shown as a transparent point, and the next step taken is the green-filled dot.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/grad_descent_1.png\" alt='grad_descent_1' width='800'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "Looking pretty good! We do have a problem though – once we arrive close to the minimum value of the function, our guesses \"bounce\" back and forth past the minimum without ever reaching it.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/grad_descent_2.png\" alt='grad_descent_2' width='500'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "In other words, each step we take when updating our guess moves us too far. We can address this by decreasing the size of each step. \n",
+        "\n",
+        "#### Algorithm Attempt 2\n",
+        "Let's update our algorithm to use a **learning rate** (also sometimes called the step size), which controls how far we move with each update. We represent the learning rate with $\\alpha$. \n",
+        "\n",
+        "$$x^{(t+1)} = x^{(t)} - \\alpha \\frac{d}{dx} f(x^{(t)})$$\n",
+        "\n",
+        "A small $\\alpha$ means that we will take small steps; a large $\\alpha$ means we will take large steps. When do we stop updating? We stop updating either after a fixed number of updates or after a subsequent update doesn't change much.\n",
+        "\n",
+        "Updating our function to use $\\alpha=0.3$, our algorithm successfully **converges** (settles on a solution and stops updating significantly, or at all) on the minimum value.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/grad_descent_3.png\" alt='grad_descent_3' width='500'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "### Convexity\n",
+        "In our analysis above, we focused our attention on the global minimum of the loss function. You may be wondering: what about the local minimum that's just to the left? \n",
+        "\n",
+        "If we had chosen a different starting guess for $\\theta$, or a different value for the learning rate $\\alpha$, our algorithm may have gotten \"stuck\" and converged on the local minimum, rather than on the true optimum value of loss. \n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/local.png\" alt='local' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "If the loss function is **convex**, gradient descent is guaranteed to converge and find the global minimum of the objective function. Formally, a function $f$ is convex if:\n",
+        "$$tf(a) + (1-t)f(b) \\geq f(ta + (1-t)b)$$\n",
+        "for all $a, b$ in the domain of $f$ and $t \\in [0, 1]$.\n",
+        "\n",
+        "To put this into words: if you drew a line between any two points on the curve, all values on the curve must be *on or below* the line. Importantly, any local minimum of a convex function is also its global minimum so we avoid the situation where the algorithm converges on some critical point that is not the minimum of the function.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/convex.png\" alt='convex' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "In summary, non-convex loss functions can cause problems with optimization. This means that our choice of loss function is a key factor in our modeling process. It turns out that MSE *is* convex, which is a major reason why it is such a popular choice of loss function. Gradient descent is only guaranteed to converge (given enough iterations and an appropriate step size) for convex functions.\n",
+        "\n",
+        "### Gradient Descent in 1 Dimension\n",
+        "\n",
+        "> **Terminology clarification**: In past lectures, we have used “loss” to refer to the error incurred on a *single* datapoint. In applications, we usually care more about the average error across *all* datapoints. Going forward, we will take the “model’s loss” to mean the model’s average error across the dataset. This is sometimes also known as the empirical risk, cost function, or objective function. $$L(\\theta) = R(\\theta) = \\frac{1}{n} \\sum_{i=1}^{n} L(y_i, \\hat{y}_i)$$\n",
+        "\n",
+        "In our discussion above, we worked with some arbitrary function $f$. As data scientists, we will almost always work with gradient descent in the context of optimizing *models* – specifically, we want to apply gradient descent to find the minimum of a *loss function*. In a modeling context, our goal is to minimize a loss function by choosing the minimizing model *parameters*.\n",
+        "\n",
+        "Recall our modeling workflow from the past few lectures: \n",
+        "\n",
+        "1. Define a model with some parameters $\\theta_i$\n",
+        "2. Choose a loss function \n",
+        "3. Select the values of $\\theta_i$ that minimize the loss function on the data\n",
+        "\n",
+        "Gradient descent is a powerful technique for completing this last task. By applying the gradient descent algorithm, we can select values for our parameters $\\theta_i$ that will lead to the model having minimal loss on the training data.\n",
+        "\n",
+        "When using gradient descent in a modeling context, we:\n",
+        "\n",
+        "1. Make guesses for the minimizing $\\theta_i$\n",
+        "2. Compute the derivative of the loss function $L$\n",
+        "\n",
+        "We can \"translate\" our gradient descent rule from before by replacing $x$ with $\\theta$ and $f$ with $L$:\n",
+        "\n",
+        "$$\\theta^{(t+1)} = \\theta^{(t)} - \\alpha \\frac{d}{d\\theta} L(\\theta^{(t)})$$\n",
+        "\n",
+        "#### Gradient Descent on the `tips` Dataset \n",
+        "To see this in action, let's consider a case where we have a linear model with no offset. We want to predict the tip (y) given the price of a meal (x). To do this, we\n",
+        "\n",
+        "* Choose a model: $\\hat{y} = \\theta_1 x$,\n",
+        "* Choose a loss function: $L(\\theta) = MSE(\\theta) = \\frac{1}{n} \\sum_{i=1}^n (y_i - \\theta_1x_i)^2$.\n",
+        "\n",
+        "Let's apply our `gradient_descent` function from before to optimize our model on the `tips` dataset. We will try to select the best parameter $\\theta_1$ to predict the `tip` $y$ from the `total_bill` $x$."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "df = sns.load_dataset(\"tips\")\n",
+        "df.head()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We can visualize the value of the MSE on our dataset for different possible choices of $\\theta_1$. To optimize our model, we want to select the value of $\\theta_1$ that leads to the lowest MSE."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "import plotly.graph_objects as go\n",
+        "\n",
+        "def derivative_arbitrary(x):\n",
+        "    return (4*x**3 - 45*x**2 + 160*x - 180)/10\n",
+        "\n",
+        "fig = go.Figure()\n",
+        "roots = np.array([2.3927, 3.5309, 5.3263])\n",
+        "\n",
+        "fig.add_trace(go.Scatter(x = xs, y = arbitrary(xs), \n",
+        "                         mode = \"lines\", name = \"f\"))\n",
+        "fig.add_trace(go.Scatter(x = xs, y = derivative_arbitrary(xs), \n",
+        "                         mode = \"lines\", name = \"df\", line = {\"dash\": \"dash\"}))\n",
+        "fig.add_trace(go.Scatter(x = np.array(roots), y = 0*roots, \n",
+        "                         mode = \"markers\", name = \"df = zero\", marker_size = 12))\n",
+        "fig.update_layout(font_size = 20, yaxis_range=[-1, 3])\n",
+        "fig.update_layout(autosize=False, width=800, height=600)\n",
+        "fig.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "To apply gradient descent, we need to compute the derivative of the loss function with respect to our parameter $\\theta_1$.\n",
+        "\n",
+        "* Given our loss function, $$L(\\theta) = MSE(\\theta) = \\frac{1}{n} \\sum_{i=1}^n (y_i - \\theta_1x_i)^2$$\n",
+        "* We take the derivative with respect to $\\theta_1$ $$\\frac{\\partial}{\\partial \\theta_{1}} L(\\theta_1^{(t)}) = \\frac{-2}{n} \\sum_{i=1}^n (y_i - \\theta_1^{(t)} x_i) x_i$$\n",
+        "* Which results in the gradient descent update rule\n",
+        "$$\\theta_1^{(t+1)} = \\theta_1^{(t)} - \\alpha \\frac{\\partial}{\\partial \\theta_{1}} L(\\theta_1^{(t)})$$\n",
+        "\n",
+        "for some learning rate $\\alpha$.\n",
+        "\n",
+        "Implementing this in code, we can visualize the MSE loss on the `tips` data. **MSE is convex**, so there is one global minimum."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "def gradient_descent(df, initial_guess, alpha, n):\n",
+        "    \"\"\"Performs n steps of gradient descent on df using learning rate alpha starting\n",
+        "       from initial_guess. Returns a numpy array of all guesses over time.\"\"\"\n",
+        "    guesses = [initial_guess]\n",
+        "    current_guess = initial_guess\n",
+        "    while len(guesses) < n:\n",
+        "        current_guess = current_guess - alpha * df(current_guess)\n",
+        "        guesses.append(current_guess)\n",
+        "        \n",
+        "    return np.array(guesses)\n",
+        "\n",
+        "def mse_single_arg(theta_1):\n",
+        "    \"\"\"Returns the MSE on our data for the given theta1\"\"\"\n",
+        "    x = df[\"total_bill\"]\n",
+        "    y_obs = df[\"tip\"]\n",
+        "    y_hat = theta_1 * x\n",
+        "    return np.mean((y_hat - y_obs) ** 2)\n",
+        "\n",
+        "def mse_loss_derivative_single_arg(theta_1):\n",
+        "    \"\"\"Returns the derivative of the MSE on our data for the given theta1\"\"\"\n",
+        "    x = df[\"total_bill\"]\n",
+        "    y_obs = df[\"tip\"]\n",
+        "    y_hat = theta_1 * x\n",
+        "    \n",
+        "    return np.mean(2 * (y_hat - y_obs) * x)\n",
+        "\n",
+        "loss_df = pd.DataFrame({\"theta_1\":np.linspace(-1.5, 1), \"MSE\":[mse_single_arg(theta_1) for theta_1 in np.linspace(-1.5, 1)]})\n",
+        "\n",
+        "trajectory = gradient_descent(mse_loss_derivative_single_arg, -0.5, 0.0001, 100)\n",
+        "\n",
+        "plt.plot(loss_df[\"theta_1\"], loss_df[\"MSE\"])\n",
+        "plt.scatter(trajectory, [mse_single_arg(guess) for guess in trajectory], c=\"white\", edgecolor=\"firebrick\")\n",
+        "plt.scatter(trajectory[-1], mse_single_arg(trajectory[-1]), c=\"firebrick\")\n",
+        "plt.xlabel(r\"$\\theta_1$\")\n",
+        "plt.ylabel(r\"$L(\\theta_1)$\");\n",
+        "\n",
+        "print(f\"Final guess for theta_1: {trajectory[-1]}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Gradient Descent on Multi-Dimensional Models\n",
+        "The function we worked with above was one-dimensional – we were only minimizing the function with respect to a single parameter, $\\theta$. However, models usually have a cost function with multiple parameters that need to be optimized. For example, simple linear regression has 2 parameters: \n",
+        "$$\\hat{y} = \\theta_0 + \\theta_1x$$ \n",
+        "and multiple linear regression has $p+1$ parameters: \n",
+        "$$\\hat{\\mathbb{Y}} = \\theta_0 + \\theta_1 \\Bbb{X}_{:,1} + \\theta_2 \\Bbb{X}_{:,2} + \\cdots + \\theta_p \\Bbb{X}_{:,p}$$\n",
+        "\n",
+        "We'll need to expand gradient descent so we can update our guesses for all model parameters all in one go.\n",
+        "\n",
+        "With multiple parameters to optimize, we consider a **loss surface**, or the model's loss for a particular *combination* of possible parameter values."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "import plotly.graph_objects as go\n",
+        "\n",
+        "\n",
+        "def mse_loss(theta, X, y_obs):\n",
+        "    y_hat = X @ theta\n",
+        "    return np.mean((y_hat - y_obs) ** 2)    \n",
+        "\n",
+        "tips_with_bias = df.copy()\n",
+        "tips_with_bias[\"bias\"] = 1\n",
+        "tips_with_bias = tips_with_bias[[\"bias\", \"total_bill\"]]\n",
+        "\n",
+        "uvalues = np.linspace(0, 2, 10)\n",
+        "vvalues = np.linspace(-0.1, 0.35, 10)\n",
+        "(u,v) = np.meshgrid(uvalues, vvalues)\n",
+        "thetas = np.vstack((u.flatten(),v.flatten()))\n",
+        "\n",
+        "def mse_loss_single_arg(theta):\n",
+        "    return mse_loss(theta, tips_with_bias, df[\"tip\"])\n",
+        "\n",
+        "MSE = np.array([mse_loss_single_arg(t) for t in thetas.T])\n",
+        "\n",
+        "loss_surface = go.Surface(x=u, y=v, z=np.reshape(MSE, u.shape))\n",
+        "\n",
+        "ind = np.argmin(MSE)\n",
+        "optimal_point = go.Scatter3d(name = \"Optimal Point\",\n",
+        "    x = [thetas.T[ind,0]], y = [thetas.T[ind,1]], \n",
+        "    z = [MSE[ind]],\n",
+        "    marker=dict(size=10, color=\"red\"))\n",
+        "\n",
+        "fig = go.Figure(data=[loss_surface, optimal_point])\n",
+        "fig.update_layout(scene = dict(\n",
+        "    xaxis_title = \"theta0\",\n",
+        "    yaxis_title = \"theta1\",\n",
+        "    zaxis_title = \"MSE\"), autosize=False, width=800, height=600)\n",
+        "\n",
+        "fig.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "We can also visualize a bird's-eye view of the loss surface from above using a contour plot:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "#| code-fold: true\n",
+        "contour = go.Contour(x=u[0], y=v[:, 0], z=np.reshape(MSE, u.shape))\n",
+        "fig = go.Figure(contour)\n",
+        "fig.update_layout(\n",
+        "    xaxis_title = \"theta0\",\n",
+        "    yaxis_title = \"theta1\", autosize=False, width=800, height=600)\n",
+        "\n",
+        "fig.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "#### The Gradient Vector\n",
+        "As before, the derivative of the loss function tells us the best way towards the minimum value.\n",
+        "\n",
+        "On a 2D (or higher) surface, the best way to go down (the negative of the gradient) is described by a *vector*.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/loss_surface.png\" alt='loss_surface' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "> Math Aside: Partial Derivatives \n",
+        "\n",
+        "> - For an equation with multiple variables, we take a **partial derivative** by differentiating with respect to just one variable at a time. The partial derivative is denoted with a $\\partial$. Intuitively, we want to see how the function changes if we only vary one variable while holding other variables constant. \n",
+        "> - Using $f(x, y) = 3x^2 + y$ as an example,\n",
+        ">   - taking the partial derivative with respect to x and treating y as a constant gives us $\\frac{\\partial f}{\\partial x} = 6x$\n",
+        ">   - taking the partial derivative with respect to y and treating x as a constant gives us $\\frac{\\partial f}{\\partial y} = 1$\n",
+        "\n",
+        "For the *vector* of parameter values $\\vec{\\theta} = \\begin{bmatrix}\n",
+        "           \\theta_{0} \\\\\n",
+        "           \\theta_{1} \\\\\n",
+        "         \\end{bmatrix}$, we take the *partial derivative* of loss with respect to each parameter: $\\frac{\\partial L}{\\partial \\theta_0}$ and $\\frac{\\partial L}{\\partial \\theta_1}$.\n",
+        "\n",
+        "> For example, consider the 2D function: $$f(\\theta_0, \\theta_1) = 8 \\theta_0^2 + 3\\theta_0\\theta_1$$\n",
+        "> For a function of 2 variables $f(\\theta_0, \\theta_1)$, we define the gradient \n",
+        "$$\n",
+        "\\begin{align}\n",
+        "\\frac{\\partial f}{\\partial \\theta_{0}} &= 16\\theta_0 + 3\\theta_1 \\\\\n",
+        "\\frac{\\partial f}{\\partial \\theta_{1}} &= 3\\theta_0 \\\\\n",
+        "\\nabla_{\\vec{\\theta}} f(\\vec{\\theta}) &=  \\begin{bmatrix} 16\\theta_0 + 3\\theta_1 \\\\ 3\\theta_0 \\\\ \\end{bmatrix}\n",
+        "\\end{align}\n",
+        "$$\n",
+        "\n",
+        "\n",
+        "The **gradient vector** of a generic function of $p+1$ variables is therefore \n",
+        "$$\\nabla_{\\vec{\\theta}} L =  \\begin{bmatrix} \\frac{\\partial L}{\\partial \\theta_0} \\\\ \\frac{\\partial L}{\\partial \\theta_1} \\\\ \\vdots \\end{bmatrix}$$\n",
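+        "where $\\nabla_{\\vec{\\theta}} L$ points in the direction of steepest *increase* (uphill) of the surface, so its negative, $-\\nabla_{\\vec{\\theta}} L$, points in the downhill direction. We can interpret each entry of the gradient (each partial derivative) as: \"If I nudge the $i$th model weight, what happens to loss?\"\n",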
+        "\n",
+        "We can use this to update our 1D gradient rule for models with multiple parameters. \n",
+        "\n",
+        "* Recall our 1D update rule: $$\\theta^{(t+1)} = \\theta^{(t)} - \\alpha \\frac{d}{d\\theta}L(\\theta^{(t)})$$ \n",
+        "* For models with multiple parameters, we work in terms of vectors:\n",
+        "$$\\begin{bmatrix}\n",
+        "           \\theta_{0}^{(t+1)} \\\\\n",
+        "           \\theta_{1}^{(t+1)} \\\\\n",
+        "           \\vdots\n",
+        "         \\end{bmatrix} = \\begin{bmatrix}\n",
+        "           \\theta_{0}^{(t)} \\\\\n",
+        "           \\theta_{1}^{(t)} \\\\\n",
+        "           \\vdots\n",
+        "         \\end{bmatrix} - \\alpha \\begin{bmatrix}\n",
+        "           \\frac{\\partial L}{\\partial \\theta_{0}} \\\\\n",
+        "           \\frac{\\partial L}{\\partial \\theta_{1}} \\\\\n",
+        "           \\vdots \\\\\n",
+        "         \\end{bmatrix}$$\n",
+        "  \n",
+        "* Written in a more compact form, $$\\vec{\\theta}^{(t+1)} = \\vec{\\theta}^{(t)} - \\alpha \\nabla_{\\vec{\\theta}} L(\\theta^{(t)}) $$\n",
+        "\n",
+        "  * $\\theta$ is a vector with our model weights\n",
+        "  * $L$ is the loss function\n",
+        "  * $\\alpha$ is the learning rate (ours is constant, but other techniques use an $\\alpha$ that decreases over time)\n",
+        "  * $\\vec{\\theta}^{(t)}$ is the current value of $\\theta$\n",
+        "  * $\\vec{\\theta}^{(t+1)}$ is the next value of $\\theta$\n",
+        "  * $\\nabla_{\\vec{\\theta}} L(\\theta^{(t)})$ is the gradient of the loss function evaluated at the current $\\vec{\\theta}^{(t)}$\n",
+        "\n",
+        "\n",
+        "### Batch Gradient Descent and Stochastic Gradient Descent\n",
+        "\n",
+        "Formally, the algorithm we derived above is called **batch gradient descent.** For each iteration of the algorithm, the derivative of loss is computed across the *entire* batch of all $n$ datapoints. While this update rule works well in theory, it is not practical in most circumstances. For large datasets (with perhaps billions of datapoints), finding the gradient across all the data is incredibly computationally taxing; gradient descent will converge slowly because each individual update is slow.\n",
+        "\n",
+        "**Stochastic (mini-batch) gradient descent** tries to address this issue. In stochastic descent, only a *sample* of the full dataset is used at each update. We estimate the true gradient of the loss surface using just that sample of data. The **batch size** is the number of data points used in each sample. The sampling strategy is generally without replacement (the data is shuffled, and then successive groups of batch-size examples are taken from it one batch at a time).\n",
+        "\n",
+        "Each complete \"pass\" through the data is known as a **training epoch**. After shuffling the data, in a single **training epoch** of stochastic gradient descent, we\n",
+        "* Compute the gradient on the first x% of the data. Update the parameter guesses.\n",
+        "* Compute the gradient on the next x% of the data. Update the parameter guesses.\n",
+        "* $\\dots$\n",
+        "* Compute the gradient on the last x% of the data. Update the parameter guesses.\n",
+        "\n",
+        "Every data point appears once in a single training epoch. We then perform several training epochs until we're satisfied.\n",
+        "\n",
+        "Batch gradient descent is a deterministic technique – because the entire dataset is used at each update iteration, the algorithm will always advance towards the minimum of the loss surface. In contrast, stochastic gradient descent involves an element of randomness. Since only a subset of the full data is used to update the guess for $\\vec{\\theta}$ at each iteration, there's a chance the algorithm will not progress towards the true minimum of loss with each update. Over the longer term, these stochastic techniques should still converge towards the optimal solution. \n",
+        "\n",
+        "The diagrams below represent a \"bird's eye view\" of a loss surface from above. Notice that batch gradient descent takes a direct path towards the optimal $\\hat{\\theta}$. Stochastic gradient descent, in contrast, \"hops around\" on its path to the minimum point on the loss surface. This reflects the randomness of the sampling process at each update step.\n",
+        "\n",
+        "<div align=\"middle\">\n",
+        "  <table style=\"width:100%\">\n",
+        "    <tr align=\"center\">\n",
+        "      <td><img src=\"images/stochastic.png\" alt='stochastic' width='600'>\n",
+        "      </td>\n",
+        "    </tr>\n",
+        "  </table>\n",
+        "</div>\n",
+        "\n",
+        "To summarize the tradeoffs of batch size: \n",
+        "\n",
+        "| - | Smaller Batch Size | Larger Batch Size | \n",
+        "| -- | -- | -- | \n",
+        "| Pros | More frequent gradient updates | Leverage hardware acceleration to improve overall system performance; higher-quality gradient updates | \n",
+        "| Cons | More variability in the gradient estimates | Less frequent gradient updates |\n",
+        "\n",
+        "The typical solution is to set batch size to ensure sufficient hardware utilization.\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "name": "python3",
+      "language": "python",
+      "display_name": "Python 3 (ipykernel)"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/gradient_descent/gradient_descent.qmd b/gradient_descent/gradient_descent.qmd
index a5e3cda2..253752f2 100644
--- a/gradient_descent/gradient_descent.qmd
+++ b/gradient_descent/gradient_descent.qmd
@@ -726,4 +726,3 @@ To summarize the tradeoffs of batch size:
 
 The typical solution is to set batch size to ensure sufficient hardware utilization.
 
-

	total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4
	day_Fri	day_Sat	day_Sun	day_Thur
0	0.0	0.0	1.0	0.0
1	0.0	0.0	1.0	0.0
2	0.0	0.0	1.0	0.0
3	0.0	0.0	1.0	0.0
4	0.0	0.0	1.0	0.0
	Feature	Model Coefficient
0	total_bill	0.092994
1	size	0.187132
2	day_Fri	0.745787
3	day_Sat	0.621129
4	day_Sun	0.732289
5	day_Thur	0.668294