diff --git a/code/lecture7-convnet.ipynb b/code/lecture7-convnet.ipynb new file mode 100644 index 0000000..cdcd1a3 --- /dev/null +++ b/code/lecture7-convnet.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lecture 7: Convolutional networks\n", + "\n", + "Notebook adapted from [Deep Learning (with PyTorch)](https://github.com/Atcold/pytorch-Deep-Learning) by Alfredo Canziani. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "from torchvision import datasets, transforms\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "device" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MNIST" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tf = transforms.Compose([transforms.ToTensor(),\n", + " transforms.Normalize((0.1307,), (0.3081,))])\n", + "\n", + "train_loader = torch.utils.data.DataLoader(datasets.MNIST(\"./data\", train=True, transform=tf),\n", + " batch_size=64, shuffle=True)\n", + "\n", + "test_loader = torch.utils.data.DataLoader(datasets.MNIST(\"./data\", train=False, transform=tf),\n", + " batch_size=1000, shuffle=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "batch = next(iter(train_loader))\n", + "x = batch[0][:10]\n", + "y = batch[1][:10]\n", + "\n", + "fig, axs = plt.subplots(1, 5, figsize=(12, 4))\n", + "\n", + "for i in range(5):\n", + " axs[i].imshow(x[i].squeeze().numpy())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MLP vs ConvNet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class MLP(nn.Module):\n", + " def __init__(self, D, H, C):\n", + " super().__init__()\n", + " self.D = D\n", + " self.net = nn.Sequential(\n", + " nn.Linear(D, H),\n", + " nn.ReLU(),\n", + " nn.Linear(H, H),\n", + " nn.ReLU(),\n", + " nn.Linear(H, C),\n", + " # nn.Softmax(dim=-1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " x = x.view(-1, self.D)\n", + " return self.net(x)\n", + "\n", + "class ConvNet(nn.Module):\n", + " def __init__(self, D, n_kernels, C):\n", + " super().__init__()\n", + " self.net = nn.Sequential(\n", + " nn.Conv2d(in_channels=1, out_channels=n_kernels, kernel_size=5),\n", + " nn.ReLU(),\n", + " nn.MaxPool2d(kernel_size=2),\n", + " nn.Conv2d(in_channels=n_kernels, out_channels=n_kernels, kernel_size=5),\n", + " nn.ReLU(),\n", + " nn.MaxPool2d(kernel_size=2),\n", + " nn.Flatten(),\n", + " nn.Linear(n_kernels * 4 * 4, 50),\n", + " nn.ReLU(),\n", + " nn.Linear(50, C),\n", + " # nn.Softmax(dim=-1)\n", + " )\n", + " \n", + " def forward(self, x):\n", + " return self.net(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, perm=torch.arange(0, 784).long(), n_epochs=1):\n", + " model.train() \n", + " optimizer = torch.optim.AdamW(model.parameters())\n", + " \n", + " for epoch in range(n_epochs):\n", + " for i, (data, target) in enumerate(train_loader):\n", + " # send to device\n", + " data, targets = data.to(device), target.to(device)\n", + "\n", + 
" # permute pixels\n", + " data = data.view(-1, 28*28)\n", + " data = data[:, perm]\n", + " data = data.view(-1, 1, 28, 28)\n", + "\n", + " # step\n", + " optimizer.zero_grad()\n", + " logits = model(data)\n", + " \n", + " loss = F.cross_entropy(logits, targets)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " if i % 100 == 0:\n", + " print(f\"epoch={epoch}, step={i}: train loss={loss.item():.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test(model, perm=torch.arange(0, 784).long()):\n", + " model.eval()\n", + " \n", + " test_loss = 0\n", + " correct = 0\n", + " \n", + " for data, targets in test_loader:\n", + " # send to device\n", + " data, targets = data.to(device), targets.to(device)\n", + " \n", + " # permute pixels\n", + " data = data.view(-1, 28*28)\n", + " data = data[:, perm]\n", + " data = data.view(-1, 1, 28, 28)\n", + " \n", + " # metrics\n", + " logits = model(data)\n", + " test_loss += F.cross_entropy(logits, targets, reduction='sum').item()\n", + " preds = torch.argmax(logits, dim=1) \n", + " correct += (preds == targets).sum()\n", + "\n", + " test_loss /= len(test_loader.dataset)\n", + " accuracy = correct / len(test_loader.dataset)\n", + " \n", + " print(f\"test loss={test_loss:.4f}, accuracy={accuracy:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# MLP\n", + "D = 28*28 \n", + "C = 10 \n", + "H = 8\n", + "\n", + "mlp = MLP(D, H, C)\n", + "mlp.to(device)\n", + "print(f\"Parameters={sum(p.numel() for p in mlp.parameters())/1e3}K\")\n", + "\n", + "train(mlp)\n", + "test(mlp)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ConvNet, with the same number of parameters\n", + "n_kernels = 6\n", + "\n", + "convnet = ConvNet(D, n_kernels, C)\n", + "convnet.to(device)\n", + "print(f\"Parameters={sum(p.numel() for p in convnet.parameters())/1e3}K\")\n", + "\n", + "train(convnet)\n", + "test(convnet)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The convolutional network performs better with the same number of parameters, thanks to its use of prior knowledge about images:\n", + "\n", + "* Use of convolution: Locality and stationarity in images\n", + "* Pooling: builds in some translation invariance\n", + "\n", + "What if those assumptions are wrong?" 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# MLP vs ConvNet, on shuffled pixels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "perm = torch.randperm(784)\n",
+    "\n",
+    "batch = next(iter(train_loader))\n",
+    "x = batch[0][:10]\n",
+    "y = batch[1][:10]\n",
+    "\n",
+    "fig, axs = plt.subplots(1, 5, figsize=(12, 4))\n",
+    "\n",
+    "for i in range(5):\n",
+    "    axs[i].imshow(x[i].squeeze().numpy())\n",
+    "    \n",
+    "fig, axs = plt.subplots(1, 5, figsize=(12, 4))\n",
+    "x = x.view(-1, 28*28)\n",
+    "x = x[:, perm]\n",
+    "x = x.view(-1, 1, 28, 28)\n",
+    "\n",
+    "for i in range(5):\n",
+    "    axs[i].imshow(x[i].squeeze().numpy())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ConvNet on shuffled pixels\n",
+    "n_kernels = 6\n",
+    "\n",
+    "convnet = ConvNet(D, n_kernels, C)\n",
+    "convnet.to(device)\n",
+    "print(f\"Parameters={sum(p.numel() for p in convnet.parameters())/1e3}K\")\n",
+    "\n",
+    "train(convnet, perm=perm)\n",
+    "test(convnet, perm=perm)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# MLP on shuffled pixels\n",
+    "D = 28*28 \n",
+    "C = 10 \n",
+    "H = 8\n",
+    "\n",
+    "mlp = MLP(D, H, C)\n",
+    "mlp.to(device)\n",
+    "print(f\"Parameters={sum(p.numel() for p in mlp.parameters())/1e3}K\")\n",
+    "\n",
+    "train(mlp, perm=perm)\n",
+    "test(mlp, perm=perm)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The convolutional network's performance drops when we permute the pixels, but the MLP's performance stays the same.\n",
+    "* The ConvNet assumes that pixels lie on a grid and that image statistics are local and stationary.\n",
+    "* It loses performance when this assumption is wrong.\n",
+    "* The fully-connected network does not make this assumption.\n",
+    "* It does less well when the assumption holds, since it cannot take advantage of this prior knowledge.\n",
+    "* But it doesn't suffer when the assumption is wrong."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "torch-cpu", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/code/lecture7-spiral.ipynb b/code/lecture7-spiral.ipynb index 73777a5..9018675 100644 --- a/code/lecture7-spiral.ipynb +++ b/code/lecture7-spiral.ipynb @@ -142,7 +142,7 @@ "\n", "model.to(device) \n", "criterion = torch.nn.BCELoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) \n", + "optimizer = torch.optim.AdamW(model.parameters()) \n", "\n", "# Training\n", "for t in range(2000):\n", @@ -181,7 +181,7 @@ "\n", "model.to(device) \n", "criterion = torch.nn.NLLLoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) \n", + "optimizer = torch.optim.AdamW(model.parameters()) \n", "\n", "# Training\n", "for t in range(2000):\n", @@ -219,6 +219,13 @@ "metadata": {}, "outputs": [], "source": [ + "# To show and discuss:\n", + "# - Effect of the non-linearity\n", + "# - Effect of H \n", + "# - Effect of the number of layers \n", + "# - Effect of the activation function\n", + "# - Multi-class classification\n", + "\n", "H = 100\n", "\n", "model = torch.nn.Sequential(\n", @@ -228,21 +235,9 @@ " torch.nn.Softmax(dim=1)\n", ")\n", "\n", - "# model = torch.nn.Sequential(\n", - "# torch.nn.Linear(D, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, H),\n", - "# torch.nn.ReLU(),\n", - "# torch.nn.Linear(H, C)\n", - "# )\n", - "\n", "model.to(device)\n", "criterion = torch.nn.NLLLoss()\n", - "optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5) \n", + "optimizer = torch.optim.AdamW(model.parameters()) \n", "\n", "for t in range(2000):\n", " y_pred = model(X)\n", diff --git a/lecture7.md b/lecture7.md index 1bfb8b3..4a6c3ab 100755 --- a/lecture7.md +++ b/lecture7.md @@ -402,6 +402,10 @@ y\_k &= \sum\_{j=0}^{q-1} v\_{kj} h\_j + c\_k, \end{aligned}$$ where $w\_{ji}$, $b\_j$, $v\_{kj}$ and $c\_k$ ($i=0, ..., d\_\text{in}-1$, $j=0, ..., q-1$, $k=0, ..., d\_\text{out}-1$) are the model parameters and $\sigma$ is an activation function. +??? + +Draw the (generic) architecture of a shallow network. + --- class: middle @@ -420,12 +424,15 @@ where $w\_{0}$, $w\_{1}$, $w\_{2}$, $b\_0$, $b\_1$, $b\_2$, $v\_{0}$, $v\_{1}$, class: middle -.center.width-100[![](figures/lec7/ShallowFunctions.svg)] +.center.width-100[![](figures/lec7/ShallowNet.svg)] -This network defines a family of piecewise linear functions where the positions of the joints, the slopes and the heights of the functions are determined by the 10 parameters $w\_{0}$, $w\_{1}$, $w\_{2}$, $b\_0$, $b\_1$, $b\_2$, $v\_{0}$, $v\_{1}$, $v\_{2}$ and $c$. +a) The input $x$ is on the left, the hidden units $h_0$, $h_1$ and $h_2$ are in the middle, and the output $y$ is on the right. Computation flows from left to right. + +b) More compact representation of the same network where we omit the bias terms, the weight labels and the activation functions. .footnote[Credits: [Simon J.D. Prince](https://udlbook.github.io/udlbook/), 2023.] 
+

---

class: middle

@@ -438,11 +445,9 @@ class: middle

class: middle

-.center.width-100[![](figures/lec7/ShallowNet.svg)]
-
-a) The input $x$ is on the left, the hidden units $h_0$, $h_1$ and $h_2$ are in the middle, and the output $y$ is on the right. Computation flows from left to right.
+.center.width-100[![](figures/lec7/ShallowFunctions.svg)]
-
-b) More compact representation of the same network where we omit the bias terms, the weight labels and the activation functions.
+This network defines a family of piecewise linear functions where the positions of the joints, the slopes and the heights of the functions are determined by the 10 parameters $w\_{0}$, $w\_{1}$, $w\_{2}$, $b\_0$, $b\_1$, $b\_2$, $v\_{0}$, $v\_{1}$, $v\_{2}$ and $c$.

.footnote[Credits: [Simon J.D. Prince](https://udlbook.github.io/udlbook/), 2023.]

@@ -502,16 +507,13 @@ class: middle

To extend the network to multivariate inputs $\mathbf{x} = [x\_0, x\_1, ..., x\_{d\_{\text{in}}-1}]$, we extend the linear relations between the input and the hidden units.

-For example, a network with two inputs $\mathbf{x} = [x\_0, x\_1]$ and a scalar output $y$ might have three hidden units $h\_0$, $h\_1$ and $h\_2$ defined as
+For example, a network with two inputs $\mathbf{x} = [x\_0, x\_1]$ might have three hidden units $h\_0$, $h\_1$ and $h\_2$ defined as
$$\begin{aligned}
h\_0 &= \sigma\left( w\_{00} x\_0 + w\_{01} x\_1 + b\_0 \right) \\\\
h\_1 &= \sigma\left( w\_{10} x\_0 + w\_{11} x\_1 + b\_1 \right) \\\\
h\_2 &= \sigma\left( w\_{20} x\_0 + w\_{21} x\_1 + b\_2 \right).
\end{aligned}$$

-The hidden units are then combined to produce the output $y$ as
-$$y = v\_0 h\_0 + v\_1 h\_1 + v\_2 h\_2 + c.$$
-
---

class: middle

@@ -589,11 +591,11 @@ h\_0 &= \sigma\left( w\_{0} x + b\_0 \right) \\\\
h\_1 &= \sigma\left( w\_{1} x + b\_1 \right) \\\\
h\_2 &= \sigma\left( w\_{2} x + b\_2 \right),
\end{aligned}$$
-the second layer is defined as
+the second layer is defined from the outputs of the first layer as
$$\begin{aligned}
h\_0' &= \sigma\left( w'\_{00} h\_0 + w'\_{01} h\_1 + w'\_{02} h\_2 + b'\_0 \right) \\\\
h\_1' &= \sigma\left( w'\_{10} h\_0 + w'\_{11} h\_1 + w'\_{12} h\_2 + b'\_1 \right) \\\\
-h\_2' &= \sigma\left( w'\_{20} h\_0 + w'\_{21} h\_1 + w'\_{22} j\_2 + b'\_2 \right)
+h\_2' &= \sigma\left( w'\_{20} h\_0 + w'\_{21} h\_1 + w'\_{22} h\_2 + b'\_2 \right),
\end{aligned}$$
and the output is defined as
$$y = v\_0 h\_0' + v\_1 h\_1' + v\_2 h\_2' + c.$$
@@ -626,7 +628,7 @@ $$\begin{aligned}
\mathbf{h} &= \begin{bmatrix} h\_0 \\\\ h\_1 \\\\ \vdots \\\\ h\_{q-1} \end{bmatrix} = \sigma\left( \begin{bmatrix} w\_{00} & w\_{01} & \cdots & w\_{0(d\_\text{in}-1)} \\\\ w\_{10} & w\_{11} & \cdots & w\_{1(d\_\text{in}-1)} \\\\ \vdots & \vdots & \ddots & \vdots \\\\ w\_{(q-1)0} & w\_{(q-1)1} & \cdots & w\_{(q-1)(d\_\text{in}-1)} \end{bmatrix} \begin{bmatrix} x\_0 \\\\ x\_1 \\\\ \vdots \\\\ x\_{d\_\text{in}-1} \end{bmatrix} + \begin{bmatrix} b\_0 \\\\ b\_1 \\\\ \vdots \\\\ b\_{q-1} \end{bmatrix} \right) \\\\
&= \sigma(\mathbf{W}^T \mathbf{x} + \mathbf{b})
\end{aligned}$$
-where $\mathbf{W} \in \mathbb{R}^{d\_\text{in} \times q}$ is the weight matrix of the hidden layer and $\mathbf{b} \in \mathbb{R}^{q}$ is the bias vector of the hidden layer.
+where $\mathbf{x} \in \mathbb{R}^{d\_\text{in}}$ is the input vector, $\mathbf{W} \in \mathbb{R}^{d\_\text{in} \times q}$ is the weight matrix of the hidden layer and $\mathbf{b} \in \mathbb{R}^{q}$ is the bias vector.

---

@@ -695,7 +697,7 @@ class: middle

## MLPs on images?

The MLP architecture is appropriate for tabular data, but not for images. -- Each pixel of an image is a feature, leading to a high-dimensional input vector. +- Each pixel of an image is an input feature, leading to a high-dimensional input vector. - Each hidden unit is connected to all input units, leading to a high-dimensional weight matrix. --- @@ -762,6 +764,10 @@ class: middle .footnote[Credits: Francois Fleuret, [EE559 Deep Learning](https://fleuret.org/ee559/), EPFL.] +??? + +Give some intuition about the interpretation of the convolution in terms of similarity between the input and the kernel. + --- class: middle
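To accompany the speaker note above, here is a minimal, self-contained sketch (not part of the lecture materials; the toy kernel and input are made up for illustration) showing that a convolution output is a sliding dot product between the kernel and local patches of the input, i.e., a local similarity score that is largest where the patch looks like the kernel.

```python
import torch
import torch.nn.functional as F

# A 3x3 kernel encoding a small diagonal pattern.
kernel = torch.tensor([[1., 0., 0.],
                       [0., 1., 0.],
                       [0., 0., 1.]]).view(1, 1, 3, 3)

# A toy 5x5 input that contains the same pattern in its top-left corner.
x = torch.zeros(1, 1, 5, 5)
x[0, 0, 0, 0] = x[0, 0, 1, 1] = x[0, 0, 2, 2] = 1.

# Built-in 2d convolution (cross-correlation, as implemented in PyTorch).
out = F.conv2d(x, kernel)  # shape (1, 1, 3, 3)

# The same values computed explicitly: one dot product per 3x3 patch.
manual = torch.empty(3, 3)
for i in range(3):
    for j in range(3):
        patch = x[0, 0, i:i+3, j:j+3]
        manual[i, j] = (patch * kernel[0, 0]).sum()

print(out[0, 0])                          # largest response at position (0, 0)
print(torch.allclose(out[0, 0], manual))  # True
```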
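Relatedly, the `n_kernels * 4 * 4` input size of the first `nn.Linear` layer in the `ConvNet` from `lecture7-convnet.ipynb` above follows from the feature-map sizes. Below is a small sketch that traces them, assuming the same layer hyperparameters as in the notebook; the ReLU layers are omitted since they do not change shapes.

```python
import torch
import torch.nn as nn

n_kernels = 6

# Same convolution/pooling stack as the notebook's ConvNet, kept as a plain
# list so that the intermediate shapes can be printed one layer at a time.
stack = [
    nn.Conv2d(1, n_kernels, kernel_size=5),          # 28x28 -> 24x24
    nn.MaxPool2d(kernel_size=2),                     # 24x24 -> 12x12
    nn.Conv2d(n_kernels, n_kernels, kernel_size=5),  # 12x12 ->  8x8
    nn.MaxPool2d(kernel_size=2),                     #  8x8  ->  4x4
]

x = torch.zeros(1, 1, 28, 28)  # one dummy MNIST-sized image
for layer in stack:
    x = layer(x)
    print(layer.__class__.__name__, tuple(x.shape))

# Flattened size fed to the first linear layer: n_kernels * 4 * 4 = 96.
print(nn.Flatten()(x).shape)  # torch.Size([1, 96])
```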