diff --git a/code/lecture7-convnet.ipynb b/code/lecture7-convnet.ipynb new file mode 100644 index 0000000..cdcd1a3 --- /dev/null +++ b/code/lecture7-convnet.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lecture 7: Convolutional networks\n", + "\n", + "Notebook adapted from [Deep Learning (with PyTorch)](https://github.com/Atcold/pytorch-Deep-Learning) by Alfredo Canziani. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torchvision import datasets, transforms\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "device" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MNIST" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf = transforms.Compose([transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))])\n", + "\n", + "train_loader = torch.utils.data.DataLoader(datasets.MNIST(\"./data\", train=True, transform=tf),\n", + " batch_size=64, shuffle=True)\n", + "\n", + "test_loader = torch.utils.data.DataLoader(datasets.MNIST(\"./data\", train=False, transform=tf),\n", + " batch_size=1000, shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch = next(iter(train_loader))\n", + "x = batch[0][:10]\n", + "y = batch[1][:10]\n", + "\n", + "fig, axs = plt.subplots(1, 5, figsize=(12, 4))\n", + "\n", + "for i in range(5):\n", + " axs[i].imshow(x[i].squeeze().numpy())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLP vs ConvNet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class MLP(nn.Module):\n", + " def __init__(self, D, H, C):\n", + " super().__init__()\n", + " self.D = D\n", + " self.net = nn.Sequential(\n", + " nn.Linear(D, H),\n", + " nn.ReLU(),\n", + " nn.Linear(H, H),\n", + " nn.ReLU(),\n", + " nn.Linear(H, C),\n", + " # nn.Softmax(dim=-1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " x = x.view(-1, self.D)\n", + " return self.net(x)\n", + "\n", + "class ConvNet(nn.Module):\n", + " def __init__(self, D, n_kernels, C):\n", + " super().__init__()\n", + " self.net = nn.Sequential(\n", + " nn.Conv2d(in_channels=1, out_channels=n_kernels, kernel_size=5),\n", + " nn.ReLU(),\n", + " nn.MaxPool2d(kernel_size=2),\n", + " nn.Conv2d(in_channels=n_kernels, out_channels=n_kernels, kernel_size=5),\n", + " nn.ReLU(),\n", + " nn.MaxPool2d(kernel_size=2),\n", + " nn.Flatten(),\n", + " nn.Linear(n_kernels * 4 * 4, 50),\n", + " nn.ReLU(),\n", + " nn.Linear(50, C),\n", + " # nn.Softmax(dim=-1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " return self.net(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, perm=torch.arange(0, 784).long(), n_epochs=1):\n", + " model.train() \n", + " optimizer = torch.optim.AdamW(model.parameters())\n", + " \n", + " for epoch in range(n_epochs):\n", + " for i, (data, target) in enumerate(train_loader):\n", + " # send to device\n", + " data, targets = data.to(device), target.to(device)\n", + "\n", + 
" # permute pixels\n", + " data = data.view(-1, 28*28)\n", + " data = data[:, perm]\n", + " data = data.view(-1, 1, 28, 28)\n", + "\n", + " # step\n", + " optimizer.zero_grad()\n", + " logits = model(data)\n", + " \n", + " loss = F.cross_entropy(logits, targets)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " if i % 100 == 0:\n", + " print(f\"epoch={epoch}, step={i}: train loss={loss.item():.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test(model, perm=torch.arange(0, 784).long()):\n", + " model.eval()\n", + " \n", + " test_loss = 0\n", + " correct = 0\n", + " \n", + " for data, targets in test_loader:\n", + " # send to device\n", + " data, targets = data.to(device), targets.to(device)\n", + " \n", + " # permute pixels\n", + " data = data.view(-1, 28*28)\n", + " data = data[:, perm]\n", + " data = data.view(-1, 1, 28, 28)\n", + " \n", + " # metrics\n", + " logits = model(data)\n", + " test_loss += F.cross_entropy(logits, targets, reduction='sum').item()\n", + " preds = torch.argmax(logits, dim=1) \n", + " correct += (preds == targets).sum()\n", + "\n", + " test_loss /= len(test_loader.dataset)\n", + " accuracy = correct / len(test_loader.dataset)\n", + " \n", + " print(f\"test loss={test_loss:.4f}, accuracy={accuracy:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# MLP\n", + "D = 28*28 \n", + "C = 10 \n", + "H = 8\n", + "\n", + "mlp = MLP(D, H, C)\n", + "mlp.to(device)\n", + "print(f\"Parameters={sum(p.numel() for p in mlp.parameters())/1e3}K\")\n", + "\n", + "train(mlp)\n", + "test(mlp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ConvNet, with the same number of parameters\n", + "n_kernels = 6\n", + "\n", + "convnet = ConvNet(D, n_kernels, C)\n", + "convnet.to(device)\n", + "print(f\"Parameters={sum(p.numel() for p in convnet.parameters())/1e3}K\")\n", + "\n", + "train(convnet)\n", + "test(convnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The convolutional network performs better with the same number of parameters, thanks to its use of prior knowledge about images:\n", + "\n", + "* Use of convolution: Locality and stationarity in images\n", + "* Pooling: builds in some translation invariance\n", + "\n", + "What if those assumptions are wrong?" 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MLP vs ConvNet, on shuffled pixels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "perm = torch.randperm(784)\n",
+    "\n",
+    "batch = next(iter(train_loader))\n",
+    "x = batch[0][:10]\n",
+    "y = batch[1][:10]\n",
+    "\n",
+    "fig, axs = plt.subplots(1, 5, figsize=(12, 4))\n",
+    "\n",
+    "for i in range(5):\n",
+    "    axs[i].imshow(x[i].squeeze().numpy())\n",
+    "    \n",
+    "fig, axs = plt.subplots(1, 5, figsize=(12, 4))\n",
+    "x = x.view(-1, 28*28)\n",
+    "x = x[:, perm]\n",
+    "x = x.view(-1, 1, 28, 28)\n",
+    "\n",
+    "for i in range(5):\n",
+    "    axs[i].imshow(x[i].squeeze().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ConvNet on shuffled pixels\n",
+    "n_kernels = 6\n",
+    "\n",
+    "convnet = ConvNet(D, n_kernels, C)\n",
+    "convnet.to(device)\n",
+    "print(f\"Parameters={sum(p.numel() for p in convnet.parameters())/1e3}K\")\n",
+    "\n",
+    "train(convnet, perm=perm)\n",
+    "test(convnet, perm=perm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# MLP on shuffled pixels\n",
+    "D = 28*28 \n",
+    "C = 10 \n",
+    "H = 8\n",
+    "\n",
+    "mlp = MLP(D, H, C)\n",
+    "mlp.to(device)\n",
+    "print(f\"Parameters={sum(p.numel() for p in mlp.parameters())/1e3}K\")\n",
+    "\n",
+    "train(mlp, perm=perm)\n",
+    "test(mlp, perm=perm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The convolutional network's performance drops when we permute the pixels, but the MLP's performance stays the same.\n",
+    "* The ConvNet assumes that pixels lie on a grid and that image statistics are local and stationary.\n",
+    "* It loses performance when this assumption is wrong.\n",
+    "* The fully-connected network does not make this assumption.\n",
+    "* It does less well when the assumption holds, since it cannot take advantage of this prior knowledge.\n",
+    "* But it doesn't suffer when the assumption is wrong."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch-cpu", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/code/lecture7-spiral.ipynb b/code/lecture7-spiral.ipynb index 73777a5..9018675 100644 --- a/code/lecture7-spiral.ipynb +++ b/code/lecture7-spiral.ipynb @@ -142,7 +142,7 @@ "\n", "model.to(device) \n", "criterion = torch.nn.BCELoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) \n", + "optimizer = torch.optim.AdamW(model.parameters()) \n", "\n", "# Training\n", "for t in range(2000):\n", @@ -181,7 +181,7 @@ "\n", "model.to(device) \n", "criterion = torch.nn.NLLLoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) \n", + "optimizer = torch.optim.AdamW(model.parameters()) \n", "\n", "# Training\n", "for t in range(2000):\n", @@ -219,6 +219,13 @@ "metadata": {}, "outputs": [], "source": [ + "# To show and discuss:\n", + "# - Effect of the non-linearity\n", + "# - Effect of H \n", + "# - Effect of the number of layers \n", + "# - Effect of the activation function\n", + "# - Multi-class classification\n", + "\n", "H = 100\n", "\n", "model = torch.nn.Sequential(\n", @@ -228,21 +235,9 @@ " torch.nn.Softmax(dim=1)\n", ")\n", "\n", - "# model = torch.nn.Sequential(\n", - "# torch.nn.Linear(D, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, C)\n", - "# )\n", - "\n", "model.to(device)\n", "criterion = torch.nn.NLLLoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) \n", + "optimizer = torch.optim.AdamW(model.parameters()) \n", "\n", "for t in range(2000):\n", " y_pred = model(X)\n", diff --git a/lecture7.md b/lecture7.md index 1bfb8b3..4a6c3ab 100755 --- a/lecture7.md +++ b/lecture7.md @@ -402,6 +402,10 @@ y\_k &= \sum\_{j=0}^{q-1} v\_{kj} h\_j + c\_k, \end{aligned}$$ where $w\_{ji}$, $b\_j$, $v\_{kj}$ and $c\_k$ ($i=0, ..., d\_\text{in}-1$, $j=0, ..., q-1$, $k=0, ..., d\_\text{out}-1$) are the model parameters and $\sigma$ is an activation function. +??? + +Draw the (generic) architecture of a shallow network. + --- class: middle @@ -420,12 +424,15 @@ where $w\_{0}$, $w\_{1}$, $w\_{2}$, $b\_0$, $b\_1$, $b\_2$, $v\_{0}$, $v\_{1}$, class: middle -.center.width-100[![](figures/lec7/ShallowFunctions.svg)] +.center.width-100[![](figures/lec7/ShallowNet.svg)] -This network defines a family of piecewise linear functions where the positions of the joints, the slopes and the heights of the functions are determined by the 10 parameters $w\_{0}$, $w\_{1}$, $w\_{2}$, $b\_0$, $b\_1$, $b\_2$, $v\_{0}$, $v\_{1}$, $v\_{2}$ and $c$. +a) The input $x$ is on the left, the hidden units $h_0$, $h_1$ and $h_2$ are in the middle, and the output $y$ is on the right. Computation flows from left to right. + +b) More compact representation of the same network where we omit the bias terms, the weight labels and the activation functions. .footnote[Credits: [Simon J.D. Prince](https://udlbook.github.io/udlbook/), 2023.] 
+

---

class: middle

@@ -438,11 +445,9 @@ class: middle

class: middle

-.center.width-100[![](figures/lec7/ShallowNet.svg)]
-
-a) The input $x$ is on the left, the hidden units $h_0$, $h_1$ and $h_2$ are in the middle, and the output $y$ is on the right. Computation flows from left to right.
+.center.width-100[![](figures/lec7/ShallowFunctions.svg)]
-
-b) More compact representation of the same network where we omit the bias terms, the weight labels and the activation functions.
+This network defines a family of piecewise linear functions where the positions of the joints, the slopes and the heights of the functions are determined by the 10 parameters $w\_{0}$, $w\_{1}$, $w\_{2}$, $b\_0$, $b\_1$, $b\_2$, $v\_{0}$, $v\_{1}$, $v\_{2}$ and $c$.

.footnote[Credits: [Simon J.D. Prince](https://udlbook.github.io/udlbook/), 2023.]

@@ -502,16 +507,13 @@ class: middle

To extend the network to multivariate inputs $\mathbf{x} = [x\_0, x\_1, ..., x\_{d\_{\text{in}}-1}]$, we extend the linear relations between the input and the hidden units.

-For example, a network with two inputs $\mathbf{x} = [x\_0, x\_1]$ and a scalar output $y$ might have three hidden units $h\_0$, $h\_1$ and $h\_2$ defined as
+For example, a network with two inputs $\mathbf{x} = [x\_0, x\_1]$ might have three hidden units $h\_0$, $h\_1$ and $h\_2$ defined as
$$\begin{aligned}
h\_0 &= \sigma\left( w\_{00} x\_0 + w\_{01} x\_1 + b\_0 \right) \\\\
h\_1 &= \sigma\left( w\_{10} x\_0 + w\_{11} x\_1 + b\_1 \right) \\\\
h\_2 &= \sigma\left( w\_{20} x\_0 + w\_{21} x\_1 + b\_2 \right).
\end{aligned}$$

-The hidden units are then combined to produce the output $y$ as
-$$y = v\_0 h\_0 + v\_1 h\_1 + v\_2 h\_2 + c.$$
-
---

class: middle

@@ -589,11 +591,11 @@ h\_0 &= \sigma\left( w\_{0} x + b\_0 \right) \\\\
h\_1 &= \sigma\left( w\_{1} x + b\_1 \right) \\\\
h\_2 &= \sigma\left( w\_{2} x + b\_2 \right),
\end{aligned}$$
-the second layer is defined as
+the second layer is defined from the outputs of the first layer as
$$\begin{aligned}
h\_0' &= \sigma\left( w'\_{00} h\_0 + w'\_{01} h\_1 + w'\_{02} h\_2 + b'\_0 \right) \\\\
h\_1' &= \sigma\left( w'\_{10} h\_0 + w'\_{11} h\_1 + w'\_{12} h\_2 + b'\_1 \right) \\\\
-h\_2' &= \sigma\left( w'\_{20} h\_0 + w'\_{21} h\_1 + w'\_{22} j\_2 + b'\_2 \right)
+h\_2' &= \sigma\left( w'\_{20} h\_0 + w'\_{21} h\_1 + w'\_{22} h\_2 + b'\_2 \right),
\end{aligned}$$
and the output is defined as
$$y = v\_0 h\_0' + v\_1 h\_1' + v\_2 h\_2' + c.$$
@@ -626,7 +628,7 @@ $$\begin{aligned}
\mathbf{h} &= \begin{bmatrix} h\_0 \\\\ h\_1 \\\\ \vdots \\\\ h\_{q-1} \end{bmatrix} = \sigma\left( \begin{bmatrix} w\_{00} & w\_{01} & \cdots & w\_{0(d\_\text{in}-1)} \\\\ w\_{10} & w\_{11} & \cdots & w\_{1(d\_\text{in}-1)} \\\\ \vdots & \vdots & \ddots & \vdots \\\\ w\_{(q-1)0} & w\_{(q-1)1} & \cdots & w\_{(q-1)(d\_\text{in}-1)} \end{bmatrix} \begin{bmatrix} x\_0 \\\\ x\_1 \\\\ \vdots \\\\ x\_{d\_\text{in}-1} \end{bmatrix} + \begin{bmatrix} b\_0 \\\\ b\_1 \\\\ \vdots \\\\ b\_{q-1} \end{bmatrix} \right) \\\\
&= \sigma(\mathbf{W}^T \mathbf{x} + \mathbf{b})
\end{aligned}$$
-where $\mathbf{W} \in \mathbb{R}^{d\_\text{in} \times q}$ is the weight matrix of the hidden layer and $\mathbf{b} \in \mathbb{R}^{q}$ is the bias vector of the hidden layer.
+where $\mathbf{x} \in \mathbb{R}^{d\_\text{in}}$ is the input vector, $\mathbf{W} \in \mathbb{R}^{d\_\text{in} \times q}$ is the weight matrix of the hidden layer and $\mathbf{b} \in \mathbb{R}^{q}$ is the bias vector.

---

@@ -695,7 +697,7 @@ class: middle

## MLPs on images?

The MLP architecture is appropriate for tabular data, but not for images. -- Each pixel of an image is a feature, leading to a high-dimensional input vector. +- Each pixel of an image is an input feature, leading to a high-dimensional input vector. - Each hidden unit is connected to all input units, leading to a high-dimensional weight matrix. --- @@ -762,6 +764,10 @@ class: middle .footnote[Credits: Francois Fleuret, [EE559 Deep Learning](https://fleuret.org/ee559/), EPFL.] +??? + +Give some intuition about the interpretation of the convolution in terms of similarity between the input and the kernel. + --- class: middle
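To accompany the speaker note above, here is a minimal, self-contained sketch (not part of the lecture materials; the toy kernel and input are made up for illustration) showing that a convolution output is a sliding dot product between the kernel and local patches of the input, i.e., a local similarity score that is largest where the patch looks like the kernel.

```python
import torch
import torch.nn.functional as F

# A 3x3 kernel encoding a small diagonal pattern.
kernel = torch.tensor([[1., 0., 0.],
                       [0., 1., 0.],
                       [0., 0., 1.]]).view(1, 1, 3, 3)

# A toy 5x5 input that contains the same pattern in its top-left corner.
x = torch.zeros(1, 1, 5, 5)
x[0, 0, 0, 0] = x[0, 0, 1, 1] = x[0, 0, 2, 2] = 1.

# Built-in 2d convolution (cross-correlation, as implemented in PyTorch).
out = F.conv2d(x, kernel)  # shape (1, 1, 3, 3)

# The same values computed explicitly: one dot product per 3x3 patch.
manual = torch.empty(3, 3)
for i in range(3):
    for j in range(3):
        patch = x[0, 0, i:i+3, j:j+3]
        manual[i, j] = (patch * kernel[0, 0]).sum()

print(out[0, 0])                          # largest response at position (0, 0)
print(torch.allclose(out[0, 0], manual))  # True
```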
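Relatedly, the `n_kernels * 4 * 4` input size of the first `nn.Linear` layer in the `ConvNet` from `lecture7-convnet.ipynb` above follows from the feature-map sizes. Below is a small sketch that traces them, assuming the same layer hyperparameters as in the notebook; the ReLU layers are omitted since they do not change shapes.

```python
import torch
import torch.nn as nn

n_kernels = 6

# Same convolution/pooling stack as the notebook's ConvNet, kept as a plain
# list so that the intermediate shapes can be printed one layer at a time.
stack = [
    nn.Conv2d(1, n_kernels, kernel_size=5),          # 28x28 -> 24x24
    nn.MaxPool2d(kernel_size=2),                     # 24x24 -> 12x12
    nn.Conv2d(n_kernels, n_kernels, kernel_size=5),  # 12x12 ->  8x8
    nn.MaxPool2d(kernel_size=2),                     #  8x8  ->  4x4
]

x = torch.zeros(1, 1, 28, 28)  # one dummy MNIST-sized image
for layer in stack:
    x = layer(x)
    print(layer.__class__.__name__, tuple(x.shape))

# Flattened size fed to the first linear layer: n_kernels * 4 * 4 = 96.
print(nn.Flatten()(x).shape)  # torch.Size([1, 96])
```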