From 8561e627f55928d6d8fe0101581db026fdef83c6 Mon Sep 17 00:00:00 2001
From: Nikhil Reddy
Date: Mon, 30 Sep 2024 18:52:13 -0700
Subject: [PATCH] fix note 10 and 11

---
 .../loss_transformations.qmd            | 22 ++-----------------
 intro_to_modeling/intro_to_modeling.qmd |  8 +++----
 2 files changed, 5 insertions(+), 25 deletions(-)

diff --git a/constant_model_loss_transformations/loss_transformations.qmd b/constant_model_loss_transformations/loss_transformations.qmd
index 0ab34966..fc8fd9f8 100644
--- a/constant_model_loss_transformations/loss_transformations.qmd
+++ b/constant_model_loss_transformations/loss_transformations.qmd
@@ -90,7 +90,6 @@ Let's take a look at four different datasets.
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -102,7 +101,6 @@ from mpl_toolkits.mplot3d import Axes3D
 ```
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # Big font helper
 def adjust_fontsize(size=None):
     SMALL_SIZE = 8
@@ -155,7 +153,6 @@ plt.style.use("default") # Revert style to default mpl
 ```
 
 ```{python}
-#| vscode: {languageId: python}
 plt.style.use("default") # Revert style to default mpl
 
 NO_VIZ, RESID, RESID_SCATTER = range(3)
@@ -194,7 +191,6 @@ def least_squares_evaluation(x, y, visualize=NO_VIZ):
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # Load in four different datasets: I, II, III, IV
 x = [10, 8, 13, 9, 11, 14, 6, 4, 12, 7, 5]
 y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
@@ -231,7 +227,6 @@ While these four sets of datapoints look very different, they actually all have
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 for dataset in ["I", "II", "III", "IV"]:
     print(f">>> Dataset {dataset}:")
     ans = anscombe[dataset]
@@ -246,7 +241,6 @@ We may also wish to visualize the model's **residuals**, defined as the differen
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # Residual visualization
 fig, axs = plt.subplots(2, 2, figsize=(10, 10))
 
@@ -366,7 +360,6 @@ The code for generating the graphs and models is included below, but we won't go
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 dugongs = pd.read_csv("data/dugongs.csv")
 data_constant = dugongs["Age"]
 data_linear = dugongs[["Length", "Age"]]
@@ -374,7 +367,6 @@ data_linear = dugongs[["Length", "Age"]]
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # Constant Model + MSE
 plt.style.use('default') # Revert style to default mpl
 adjust_fontsize(size=16)
@@ -400,7 +392,6 @@ plt.legend();
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # SLR + MSE
 def mse_linear(theta_0, theta_1, data_linear):
     data_x, data_y = data_linear.iloc[:, 0], data_linear.iloc[:, 1]
@@ -449,14 +440,13 @@ cbar.set_label("Cost Value")
 ax.set_title("MSE for different $\\theta_0, \\theta_1$")
 ax.set_xlabel("$\\theta_0$")
 ax.set_ylabel("$\\theta_1$")
-ax.set_zlabel("MSE")
+ax.set_zlabel("MSE");
 
 # plt.show()
 ```
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # Predictions
 yobs = data_linear["Age"] # The true observations y
 xs = data_linear["Length"] # Needed for linear predictions
@@ -468,7 +458,6 @@ yhats_linear = [theta_0_hat + theta_1_hat * x for x in xs]
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # Constant Model Rug Plot
 # In case we're in a weird style state
 sns.set_theme()
@@ -485,7 +474,6 @@ plt.yticks([]);
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # SLR model scatter plot
 # In case we're in a weird style state
 sns.set_theme()
@@ -599,7 +587,6 @@ Let's consider a dataset where each entry represents the number of drinks sold a
 
 ```{python}
 #| code-fold: false
-#| vscode: {languageId: python}
 drinks = np.array([20, 21, 22, 29, 33])
 drinks
 ```
@@ -608,7 +595,6 @@ From our derivations above, we know that the optimal model parameter under MSE c
 
 ```{python}
 #| code-fold: false
-#| vscode: {languageId: python}
 np.mean(drinks), np.median(drinks)
 ```
 
@@ -622,7 +608,6 @@ How do outliers affect each cost function? Imagine we replace the largest value
 
 ```{python}
 #| code-fold: false
-#| vscode: {languageId: python}
 drinks_with_outlier = np.append(drinks, 1033)
 display(drinks_with_outlier)
 np.mean(drinks_with_outlier), np.median(drinks_with_outlier)
@@ -636,7 +621,6 @@ Let's try another experiment. This time, we'll add an additional, non-outlying d
 
 ```{python}
 #| code-fold: false
-#| vscode: {languageId: python}
 drinks_with_additional_observation = np.append(drinks, 35)
 drinks_with_additional_observation
 ```
@@ -680,7 +664,6 @@ Let's revisit our dugongs example. The lengths and ages are plotted below:
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 # `corrcoef` computes the correlation coefficient between two variables
 # `std` finds the standard deviation
 x = dugongs["Length"]
@@ -708,7 +691,6 @@ An important word on $\log$: in Data 100 (and most upper-division STEM courses),
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 z = np.log(y)
 
 r = np.corrcoef(x, z)[0, 1]
@@ -746,7 +728,6 @@ $y$ is an *exponential* function of $x$. Applying an exponential fit to the untr
 
 ```{python}
 #| code-fold: true
-#| vscode: {languageId: python}
 plt.figure(dpi=120, figsize=(4, 3))
 
 plt.scatter(x, y)
@@ -815,3 +796,4 @@ In the derivation above, we decompose the expected loss, $R(\theta)$, into two k
 
 - **Variance, $\sigma_y^2$**: This term represents the spread of the data points around their mean, $\bar{y}$, and is a measure of the data's inherent variability. Importantly, it does not depend on the choice of $\theta$, meaning it's a fixed property of the data. Variance serves as an indicator of the data's dispersion and is crucial in understanding the dataset's structure, but it remains constant regardless of how we adjust our model parameter $\theta$.
 - **Bias Squared, $(\bar{y} - \theta)^2$**: This term captures the bias of the estimator, defined as the square of the difference between the mean of the data points, $\bar{y}$, and the parameter $\theta$. The bias quantifies the systematic error introduced when estimating $\theta$. Minimizing this term is essential for improving the accuracy of the estimator. When $\theta = \bar{y}$, the bias is $0$, indicating that the estimator is unbiased for the parameter it estimates. This highlights a critical principle in statistical estimation: choosing $\theta$ to be the sample mean, $\bar{y}$, minimizes the average loss, rendering the estimator both efficient and unbiased for the population mean.
+
diff --git a/intro_to_modeling/intro_to_modeling.qmd b/intro_to_modeling/intro_to_modeling.qmd
index e8ccaaea..c3b2299a 100644
--- a/intro_to_modeling/intro_to_modeling.qmd
+++ b/intro_to_modeling/intro_to_modeling.qmd
@@ -96,7 +96,6 @@ The **regression line** is the unique straight line that minimizes the **mean sq
 - $\text{residual} =\text{observed }y - \text{regression estimate}$
 
 ```{python}
-#| vscode: {languageId: python}
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
@@ -105,11 +104,11 @@ import seaborn as sns
 np.random.seed(43)
 plt.style.use('default')
 
-#Generate random noise for plotting
+# Generate random noise for plotting
 x = np.linspace(-3, 3, 100)
 y = x * 0.5 - 1 + np.random.randn(100) * 0.3
 
-#plot regression line
+# Plot regression line
 sns.regplot(x=x,y=y);
 ```
 
@@ -132,11 +131,10 @@ The correlation ($r$) is the average of the product of $x$ and $y$, both measure
 $$r = \frac{1}{n} \sum_{i=1}^n (\frac{x_i - \bar{x}}{\sigma_x})(\frac{y_i - \bar{y}}{\sigma_y})$$
 
 1. Correlation measures the strength of a **linear association** between two variables.
-2. Correlations range between -1 and 1: $|r| \leq 1$, with $r=1$ indicating perfect linear association, and $r=-1$ indicating perfect negative association. The closer $r$ is to $0$, the weaker the linear association is.
+2. Correlations range between -1 and 1: $|r| \leq 1$, with $r=1$ indicating perfect positive linear association, and $r=-1$ indicating perfect negative association. The closer $r$ is to $0$, the weaker the linear association is.
 3. Correlation says nothing about causation and non-linear association. Correlation does **not** imply causation. When $r = 0$, the two variables are uncorrelated. However, they could still be related through some non-linear relationship.
 
 ```{python}
-#| vscode: {languageId: python}
 def plot_and_get_corr(ax, x, y, title):
     ax.set_xlim(-3, 3)
     ax.set_ylim(-3, 3)
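
A note for reviewers, outside the patch itself: nearly all of the deletions above are the same line, the `#| vscode: {languageId: python}` cell option that an editor (apparently VS Code) had injected into every code cell of these chapters. If other chapters picked up the same metadata, a small script along the following lines could do the cleanup in one pass. This is a sketch under stated assumptions, not part of this commit: the glob pattern, the in-place rewrite, and running it from the repository root are all assumptions.

```python
# Hypothetical cleanup helper (not part of this patch): strip the
# "#| vscode: {languageId: python}" cell option from every .qmd file
# under the current directory, rewriting a file only when it changes.
from pathlib import Path

UNWANTED = "#| vscode: {languageId: python}"

for qmd in Path(".").rglob("*.qmd"):
    lines = qmd.read_text(encoding="utf-8").splitlines(keepends=True)
    kept = [line for line in lines if line.strip() != UNWANTED]
    if len(kept) != len(lines):
        qmd.write_text("".join(kept), encoding="utf-8")
        print(f"removed {len(lines) - len(kept)} line(s) from {qmd}")
```

Re-rendering the affected notes after such a pass should confirm that only cell metadata, and none of the executed code or prose, has changed.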