From acc853ba1ae8c1089cf8f1c6fc8303aefb6f5d89 Mon Sep 17 00:00:00 2001 From: Electronic-Waste <2690692950@qq.com> Date: Mon, 30 Sep 2024 11:19:04 +0000 Subject: [PATCH] feat: add minst pytorch example. Signed-off-by: Electronic-Waste <2690692950@qq.com> --- .../mnist-with-push-metrics-collection.ipynb | 313 ++++++++++++++++++ 1 file changed, 313 insertions(+) create mode 100644 examples/v1beta1/sdk/mnist-with-push-metrics-collection.ipynb diff --git a/examples/v1beta1/sdk/mnist-with-push-metrics-collection.ipynb b/examples/v1beta1/sdk/mnist-with-push-metrics-collection.ipynb new file mode 100644 index 00000000000..cd87349cb6b --- /dev/null +++ b/examples/v1beta1/sdk/mnist-with-push-metrics-collection.ipynb @@ -0,0 +1,313 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tune and Train with Push-based Metrics Collection Using MNIST\n", + "\n", + "In this Notebook we are going to do the following:\n", + "- Train PyTorch MNIST image classification model(CNN).\n", + "- Improve the model HyperParameters with [Kubeflow Katib](https://www.kubeflow.org/docs/components/katib/overview/).\n", + "- Use Push-based Metrics Collection to efficiently collect metrics in the training containers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Kubeflow Python SDKs\n", + "\n", + "You need to install Kubeflow SDKs to run this Notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO (Electronic-Waste): Change to release version when SDK with the updated `tune()` is published.\n", + "%pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Train Script for CNN Model\n", + "\n", + "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](https://yann.lecun.com/exdb/mnist/)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "def train_mnist_model(parameters):\n", + " import torch\n", + " import logging\n", + " import kubeflow.katib as katib\n", + " from torchvision import datasets, transforms\n", + "\n", + " logging.basicConfig(\n", + " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n", + " datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n", + " level=logging.INFO,\n", + " )\n", + " logging.info(\"--------------------------------------------------------------------------------------\")\n", + " logging.info(f\"Input Parameters: {parameters}\")\n", + " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", + "\n", + " # Get HyperParameters from the input params dict.\n", + " lr = float(parameters[\"lr\"])\n", + " momentum = float(parameters[\"momentum\"])\n", + " batch_size = int(parameters[\"batch_size\"])\n", + " num_epoch = int(parameters[\"num_epoch\"])\n", + " log_interval = int(parameters[\"log_interval\"])\n", + "\n", + " # Prepare MNIST Dataset.\n", + " def mnist_train_dataset(batch_size):\n", + " return torch.utils.data.DataLoader(\n", + " datasets.FashionMNIST(\n", + " \"./data\",\n", + " train=True,\n", + " download=True,\n", + " transform=transforms.Compose([transforms.ToTensor()]),\n", + " ),\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + " )\n", + "\n", + " def mnist_test_dataset(batch_size):\n", + " return torch.utils.data.DataLoader(\n", + " datasets.FashionMNIST(\n", + " \"./data\", train=False, transform=transforms.Compose([transforms.ToTensor()])\n", + " ),\n", + " batch_size=batch_size,\n", + " shuffle=False,\n", + " )\n", + " \n", + " # Build CNN Model.\n", + " def build_and_compile_cnn_model():\n", + " return torch.nn.Sequential(\n", + " torch.nn.Conv2d(1, 20, 5, 1),\n", + " torch.nn.ReLU(),\n", + " torch.nn.MaxPool2d(2, 2),\n", + " \n", + " torch.nn.Conv2d(20, 50, 5, 1),\n", + " torch.nn.ReLU(),\n", + " torch.nn.MaxPool2d(2, 2),\n", + " \n", + " torch.nn.Flatten(),\n", + " \n", + " torch.nn.Linear(4 * 4 * 50, 500),\n", + " torch.nn.ReLU(),\n", + " \n", + " torch.nn.Linear(500, 10),\n", + " torch.nn.LogSoftmax(dim=1)\n", + " )\n", + " \n", + " # Train CNN Model.\n", + " def train_cnn_model(model, train_loader, optimizer, epoch):\n", + " model.train()\n", + " for batch_idx, (data, target) in enumerate(train_loader):\n", + " optimizer.zero_grad()\n", + " output = model(data)\n", + " loss = torch.nn.functional.nll_loss(output, target)\n", + " loss.backward()\n", + " optimizer.step()\n", + " if batch_idx % log_interval == 0:\n", + " msg = \"Train Epoch: {} [{}/{} ({:.0f}%)]\\tloss={:.4f}\".format(\n", + " epoch,\n", + " batch_idx * len(data),\n", + " len(train_loader.dataset),\n", + " 100.0 * batch_idx / len(train_loader),\n", + " loss.item(),\n", + " )\n", + " logging.info(msg)\n", + " \n", + " # Test CNN Model and report training metrics\n", + " def test_cnn_model(model, test_loader):\n", + " model.eval()\n", + " test_loss = 0\n", + " correct = 0\n", + " with torch.no_grad():\n", + " for data, target in test_loader:\n", + " output = model(data)\n", + " test_loss += torch.nn.functional.nll_loss(\n", + " output, target, reduction=\"sum\"\n", + " ).item() # sum up batch loss\n", + " pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability\n", + " correct += pred.eq(target.view_as(pred)).sum().item()\n", + " \n", + " test_loss /= len(test_loader.dataset)\n", + " test_accuracy = float(correct) / len(test_loader.dataset)\n", + " katib.report_metrics({ # report metrics directly without outputing logs\n", + " \"accuracy\": test_accuracy, \n", + " \"loss\": test_loss,\n", + " })\n", + "\n", + " # Download dataset and construct loaders for training and testing\n", + " train_loader = mnist_train_dataset(batch_size)\n", + " test_loader = mnist_test_dataset(batch_size)\n", + "\n", + " # Build Model and Optimizer\n", + " model = build_and_compile_cnn_model()\n", + " optimizer = torch.optim.SGD(model.parameters(), lr, momentum)\n", + "\n", + " # Train Model and report metrics\n", + " for epoch_idx in range(1, num_epoch + 1):\n", + " train_cnn_model(model, train_loader, optimizer, epoch_idx)\n", + " test_cnn_model(model, test_loader)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Start Model Tuning with Katib\n", + "\n", + "If you want to improve your model, you can run HyperParameter tuning with Katib.\n", + "\n", + "The following example uses **Random Search** algorithm to tune HyperParameters.\n", + "\n", + "We are going to tune `learning rate` and `momentum`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import kubeflow.katib as katib\n", + "\n", + "# Set parameters with their distribution for HyperParameter Tuning with Katib.\n", + "parameters = {\n", + " \"lr\": katib.search.double(min=0.01, max=0.03),\n", + " \"momentum\": katib.search.double(min=0.3, max=0.7),\n", + " \"num_epoch\": 1,\n", + " \"batch_size\": 64,\n", + " \"log_interval\": 10\n", + "}\n", + "\n", + "# Start the Katib Experiment.\n", + "# TODO (Electronic-Waste): \n", + "# 1. Change `kubeflow-katib` to release version when `0.18.0` is ready.\n", + "# 2. Change `base_image` to official image when `kubeflow-katib` release version `0.18.0` is ready.\n", + "exp_name = \"tune-mnist\"\n", + "katib_client = katib.KatibClient(namespace=\"kubeflow\")\n", + "\n", + "katib_client.tune(\n", + " name=exp_name,\n", + " objective=train_mnist_model, # Objective function.\n", + " base_image=\"docker.io/electronicwaste/pytorch:gitv1\",\n", + " parameters=parameters, # HyperParameters to tune.\n", + " algorithm_name=\"random\", # Alorithm to use.\n", + " objective_metric_name=\"accuracy\", # Katib is going to optimize \"accuracy\".\n", + " additional_metric_names=[\"loss\"], # Katib is going to collect these metrics in addition to the objective metric.\n", + " max_trial_count=12, # Trial Threshold.\n", + " parallel_trial_count=2,\n", + " packages_to_install=[\"git+https://github.com/kubeflow/katib.git@master#subdirectory=sdk/python/v1beta1\"],\n", + " metrics_collector_config={\"kind\": \"Push\"},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Access to Katib UI\n", + "\n", + "You can check created experiment in the Katib UI.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Get the Best HyperParameters from the Katib Experiment\n", + "\n", + "You can get the best HyperParameters from the most optimal Katib Trial." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Katib Experiment is Succeeded: True\n", + "\n", + "Current Optimal Trial:\n", + "{'best_trial_name': 'tune-mnist-xqwfhr9w',\n", + " 'observation': {'metrics': [{'latest': '0.8276',\n", + " 'max': '0.8276',\n", + " 'min': '0.8276',\n", + " 'name': 'accuracy'},\n", + " {'latest': '0.48769191679954527',\n", + " 'max': '0.48769191679954527',\n", + " 'min': '0.48769191679954527',\n", + " 'name': 'loss'}]},\n", + " 'parameter_assignments': [{'name': 'lr', 'value': '0.024527727574297616'},\n", + " {'name': 'momentum', 'value': '0.6490973329748595'}]}\n" + ] + } + ], + "source": [ + "status = katib_client.is_experiment_succeeded(exp_name)\n", + "print(f\"Katib Experiment is Succeeded: {status}\\n\")\n", + "\n", + "best_hps = katib_client.get_optimal_hyperparameters(exp_name)\n", + "print(f\"Current Optimal Trial:\\n{best_hps}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Delete Katib Experiment\n", + "\n", + "When jobs are finished, you can delete the resources." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "katib_client.delete_experiment(exp_name)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "katib", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}