From 2c0ce60134fe07d8d9a7edb212d87f1f5bdee6b1 Mon Sep 17 00:00:00 2001 From: Yehudit Kerido Date: Mon, 2 Dec 2024 14:52:11 +0200 Subject: [PATCH] Add wait for experiment success condition and parameterized ns Signed-off-by: Yehudit Kerido --- ...cmaes-and-resume-policies-checkpoint.ipynb | 1008 +++++++++++++++++ .../tune-train-from-func-checkpoint.ipynb | 606 ++++++++++ .../v1beta1/sdk/tune-train-from-func.ipynb | 406 ++----- 3 files changed, 1709 insertions(+), 311 deletions(-) create mode 100644 examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb create mode 100644 examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb new file mode 100644 index 00000000000..72445e7390f --- /dev/null +++ b/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb @@ -0,0 +1,1008 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "editable": true, + "pycharm": { + "name": "#%% md\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# HyperParameter tuning using CMA-ES\n", + "\n", + "In this example you will deploy 3 Katib Experiments with Covariance Matrix Adaptation Evolution Strategy (CMA-ES) using Jupyter Notebook and Katib SDK. These Experiments have various resume policies.\n", + "\n", + "Reference documentation:\n", + "- https://www.kubeflow.org/docs/components/katib/experiment/#cmaes\n", + "- https://www.kubeflow.org/docs/components/katib/resume-experiment/\n", + "\n", + "The notebook shows how to create, get, check status and delete an Experiment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Install Katib SDK\n", + "\n", + "You need to install Katib SDK to run this Notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", + "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Import required packages" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import copy\n", + "\n", + "from kubeflow.katib import KatibClient\n", + "from kubernetes.client import V1ObjectMeta\n", + "from kubeflow.katib import V1beta1Experiment\n", + "from kubeflow.katib import V1beta1AlgorithmSpec\n", + "from kubeflow.katib import V1beta1ObjectiveSpec\n", + "from kubeflow.katib import V1beta1FeasibleSpace\n", + "from kubeflow.katib import V1beta1ExperimentSpec\n", + "from kubeflow.katib import V1beta1ObjectiveSpec\n", + "from kubeflow.katib import V1beta1ParameterSpec\n", + "from kubeflow.katib import V1beta1TrialTemplate\n", + "from kubeflow.katib import V1beta1TrialParameterSpec" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Define your Experiment\n", + "\n", + "You have to create your Experiment object before deploying it. This Experiment is similar to [this](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/hp-tuning/cma-es.yaml) example." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# Experiment name and namespace.\n", + "namespace = \"kubeflow\"\n", + "experiment_name = \"cmaes-example\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "\n", + "metadata = V1ObjectMeta(\n", + " name=experiment_name,\n", + " namespace=namespace\n", + ")\n", + "\n", + "# Algorithm specification.\n", + "algorithm_spec=V1beta1AlgorithmSpec(\n", + " algorithm_name=\"cmaes\"\n", + ")\n", + "\n", + "# Objective specification.\n", + "objective_spec=V1beta1ObjectiveSpec(\n", + " type=\"minimize\",\n", + " goal= 0.001,\n", + " objective_metric_name=\"loss\",\n", + ")\n", + "\n", + "# Experiment search space. 
In this example we tune learning rate and momentum.\n", + "parameters=[\n", + " V1beta1ParameterSpec(\n", + " name=\"lr\",\n", + " parameter_type=\"double\",\n", + " feasible_space=V1beta1FeasibleSpace(\n", + " min=\"0.01\",\n", + " max=\"0.06\"\n", + " ),\n", + " ),\n", + " V1beta1ParameterSpec(\n", + " name=\"momentum\",\n", + " parameter_type=\"double\",\n", + " feasible_space=V1beta1FeasibleSpace(\n", + " min=\"0.5\",\n", + " max=\"0.9\"\n", + " ),\n", + " ),\n", + "]\n", + "\n", + "# JSON template specification for the Trial's Worker Kubernetes Job.\n", + "trial_spec={\n", + " \"apiVersion\": \"batch/v1\",\n", + " \"kind\": \"Job\",\n", + " \"spec\": {\n", + " \"template\": {\n", + " \"metadata\": {\n", + " \"annotations\": {\n", + " \"sidecar.istio.io/inject\": \"false\"\n", + " }\n", + " },\n", + " \"spec\": {\n", + " \"containers\": [\n", + " {\n", + " \"name\": \"training-container\",\n", + " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n", + " \"command\": [\n", + " \"python3\",\n", + " \"/opt/pytorch-mnist/mnist.py\",\n", + " \"--epochs=1\",\n", + " \"--batch-size=64\",\n", + " \"--lr=${trialParameters.learningRate}\",\n", + " \"--momentum=${trialParameters.momentum}\",\n", + " ]\n", + " }\n", + " ],\n", + " \"restartPolicy\": \"Never\"\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Configure parameters for the Trial template.\n", + "trial_template=V1beta1TrialTemplate(\n", + " primary_container_name=\"training-container\",\n", + " trial_parameters=[\n", + " V1beta1TrialParameterSpec(\n", + " name=\"learningRate\",\n", + " description=\"Learning rate for the training model\",\n", + " reference=\"lr\"\n", + " ),\n", + " V1beta1TrialParameterSpec(\n", + " name=\"momentum\",\n", + " description=\"Momentum for the training model\",\n", + " reference=\"momentum\"\n", + " ),\n", + " ],\n", + " trial_spec=trial_spec\n", + ")\n", + "\n", + "\n", + "# Experiment object.\n", + "experiment = V1beta1Experiment(\n", + " 
api_version=\"kubeflow.org/v1beta1\",\n", + " kind=\"Experiment\",\n", + " metadata=metadata,\n", + " spec=V1beta1ExperimentSpec(\n", + " max_trial_count=3,\n", + " parallel_trial_count=2,\n", + " max_failed_trial_count=1,\n", + " algorithm=algorithm_spec,\n", + " objective=objective_spec,\n", + " parameters=parameters,\n", + " trial_template=trial_template,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "experiment_never_resume_name = \"never-resume-cmaes\"\n", + "experiment_from_volume_resume_name = \"from-volume-resume-cmaes\"\n", + "\n", + "# Create new Experiments from the previous Experiment info.\n", + "# Define Experiment with Never resume.\n", + "experiment_never_resume = copy.deepcopy(experiment)\n", + "experiment_never_resume.metadata.name = experiment_never_resume_name\n", + "experiment_never_resume.spec.resume_policy = \"Never\"\n", + "experiment_never_resume.spec.max_trial_count = 4\n", + "\n", + "# Define Experiment with FromVolume resume.\n", + "experiment_from_volume_resume = copy.deepcopy(experiment)\n", + "experiment_from_volume_resume.metadata.name = experiment_from_volume_resume_name\n", + "experiment_from_volume_resume.spec.resume_policy = \"FromVolume\"\n", + "experiment_from_volume_resume.spec.max_trial_count = 4" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "You can print the Experiment's info to verify it before submission." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cmaes-example\n", + "cmaes\n", + "-----------------\n", + "never-resume-cmaes\n", + "Never\n", + "-----------------\n", + "from-volume-resume-cmaes\n", + "FromVolume\n" + ] + } + ], + "source": [ + "print(experiment.metadata.name)\n", + "print(experiment.spec.algorithm.algorithm_name)\n", + "print(\"-----------------\")\n", + "print(experiment_never_resume.metadata.name)\n", + "print(experiment_never_resume.spec.resume_policy)\n", + "print(\"-----------------\")\n", + "print(experiment_from_volume_resume.metadata.name)\n", + "print(experiment_from_volume_resume.spec.resume_policy)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Create your Experiment\n", + "\n", + "You have to create Katib client to use the SDK." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment kubeflow-user-example-com/cmaes-example has been created\n" + ] + }, + { + "data": { + "text/html": [ + "Katib Experiment cmaes-example link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Initialize KatibClient\n", + "kclient = KatibClient(namespace=namespace)\n", + "\n", + "# Create your Experiment.\n", + "kclient.create_experiment(experiment,namespace=namespace)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Create other Experiments" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment kubeflow-user-example-com/never-resume-cmaes has been created\n" + ] + }, + { + "data": { + "text/html": [ + "Katib Experiment never-resume-cmaes link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been created\n" + ] + }, + { + "data": { + "text/html": [ + "Katib Experiment from-volume-resume-cmaes link here" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Create Experiment with never resume.\n", + "kclient.create_experiment(experiment_never_resume,namespace=namespace)\n", + "# Create Experiment with from volume resume.\n", + "kclient.create_experiment(experiment_from_volume_resume,namespace=namespace)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Get 
your Experiment\n", + "\n", + "You can get your Experiment by name and receive required data." + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'api_version': 'kubeflow.org/v1beta1',\n", + " 'kind': 'Experiment',\n", + " 'metadata': {'annotations': None,\n", + " 'creation_timestamp': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", + " 'deletion_grace_period_seconds': None,\n", + " 'deletion_timestamp': None,\n", + " 'finalizers': ['update-prometheus-metrics'],\n", + " 'generate_name': None,\n", + " 'generation': 1,\n", + " 'labels': None,\n", + " 'managed_fields': [{'api_version': 'kubeflow.org/v1beta1',\n", + " 'fields_type': 'FieldsV1',\n", + " 'fields_v1': {'f:spec': {'.': {},\n", + " 'f:algorithm': {'.': {},\n", + " 'f:algorithmName': {}},\n", + " 'f:maxFailedTrialCount': {},\n", + " 'f:maxTrialCount': {},\n", + " 'f:objective': {'.': {},\n", + " 'f:additionalMetricNames': {},\n", + " 'f:goal': {},\n", + " 'f:objectiveMetricName': {},\n", + " 'f:type': {}},\n", + " 'f:parallelTrialCount': {},\n", + " 'f:parameters': {},\n", + " 'f:trialTemplate': {'.': {},\n", + " 'f:primaryContainerName': {},\n", + " 'f:trialParameters': {},\n", + " 'f:trialSpec': {'.': {},\n", + " 'f:apiVersion': {},\n", + " 'f:kind': {},\n", + " 'f:spec': {'.': {},\n", + " 'f:template': {'.': {},\n", + " 'f:metadata': {'.': {},\n", + " 'f:annotations': {'.': {},\n", + " 'f:sidecar.istio.io/inject': {}}},\n", + " 'f:spec': {'.': {},\n", + " 'f:containers': {},\n", + " 'f:restartPolicy': {}}}}}}}},\n", + " 'manager': 'OpenAPI-Generator',\n", + " 'operation': 'Update',\n", + " 'subresource': None,\n", + " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n", + " {'api_version': 'kubeflow.org/v1beta1',\n", + " 'fields_type': 'FieldsV1',\n", + " 'fields_v1': {'f:metadata': {'f:finalizers': 
{'.': {},\n", + " 'v:\"update-prometheus-metrics\"': {}}}},\n", + " 'manager': 'katib-controller',\n", + " 'operation': 'Update',\n", + " 'subresource': None,\n", + " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n", + " {'api_version': 'kubeflow.org/v1beta1',\n", + " 'fields_type': 'FieldsV1',\n", + " 'fields_v1': {'f:status': {'.': {},\n", + " 'f:conditions': {},\n", + " 'f:currentOptimalTrial': {'.': {},\n", + " 'f:observation': {}},\n", + " 'f:runningTrialList': {},\n", + " 'f:startTime': {},\n", + " 'f:trials': {},\n", + " 'f:trialsRunning': {}}},\n", + " 'manager': 'katib-controller',\n", + " 'operation': 'Update',\n", + " 'subresource': 'status',\n", + " 'time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal())}],\n", + " 'name': 'cmaes-example',\n", + " 'namespace': 'kubeflow-user-example-com',\n", + " 'owner_references': None,\n", + " 'resource_version': '26516',\n", + " 'self_link': None,\n", + " 'uid': '1d59819e-4e5f-4adc-90cc-62c2ee867f72'},\n", + " 'spec': {'algorithm': {'algorithm_name': 'cmaes', 'algorithm_settings': None},\n", + " 'early_stopping': None,\n", + " 'max_failed_trial_count': 1,\n", + " 'max_trial_count': 3,\n", + " 'metrics_collector_spec': {'collector': {'custom_collector': None,\n", + " 'kind': 'StdOut'},\n", + " 'source': None},\n", + " 'nas_config': None,\n", + " 'objective': {'additional_metric_names': ['Train-accuracy'],\n", + " 'goal': 0.99,\n", + " 'metric_strategies': [{'name': 'Validation-accuracy',\n", + " 'value': 'max'},\n", + " {'name': 'Train-accuracy',\n", + " 'value': 'max'}],\n", + " 'objective_metric_name': 'Validation-accuracy',\n", + " 'type': 'maximize'},\n", + " 'parallel_trial_count': 2,\n", + " 'parameters': [{'feasible_space': {'list': None,\n", + " 'max': '0.06',\n", + " 'min': '0.01',\n", + " 'step': None},\n", + " 'name': 'lr',\n", + " 'parameter_type': 'double'},\n", + " {'feasible_space': {'list': None,\n", + " 'max': '5',\n", + " 'min': '2',\n", + " 'step': None},\n", + " 
'name': 'num-layers',\n", + " 'parameter_type': 'int'},\n", + " {'feasible_space': {'list': ['sgd', 'adam', 'ftrl'],\n", + " 'max': None,\n", + " 'min': None,\n", + " 'step': None},\n", + " 'name': 'optimizer',\n", + " 'parameter_type': 'categorical'}],\n", + " 'resume_policy': 'LongRunning',\n", + " 'trial_template': {'config_map': None,\n", + " 'failure_condition': 'status.conditions.#(type==\"Failed\")#|#(status==\"True\")#',\n", + " 'primary_container_name': 'training-container',\n", + " 'primary_pod_labels': None,\n", + " 'retain': None,\n", + " 'success_condition': 'status.conditions.#(type==\"Complete\")#|#(status==\"True\")#',\n", + " 'trial_parameters': [{'description': 'Learning '\n", + " 'rate for '\n", + " 'the '\n", + " 'training '\n", + " 'model',\n", + " 'name': 'learningRate',\n", + " 'reference': 'lr'},\n", + " {'description': 'Number of '\n", + " 'training '\n", + " 'model '\n", + " 'layers',\n", + " 'name': 'numberLayers',\n", + " 'reference': 'num-layers'},\n", + " {'description': 'Training '\n", + " 'model '\n", + " 'optimizer '\n", + " '(sdg, adam '\n", + " 'or ftrl)',\n", + " 'name': 'optimizer',\n", + " 'reference': 'optimizer'}],\n", + " 'trial_spec': {'apiVersion': 'batch/v1',\n", + " 'kind': 'Job',\n", + " 'spec': {'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", + " 'spec': {'containers': [{'command': ['python3',\n", + " '/opt/mxnet-mnist/mnist.py',\n", + " '--batch-size=64',\n", + " '--num-epochs=1',\n", + " '--lr=${trialParameters.learningRate}',\n", + " '--num-layers=${trialParameters.numberLayers}',\n", + " '--optimizer=${trialParameters.optimizer}'],\n", + " 'image': 'docker.io/kubeflowkatib/mxnet-mnist:v0.14.0',\n", + " 'name': 'training-container'}],\n", + " 'restartPolicy': 'Never'}}}}}},\n", + " 'status': {'completion_time': None,\n", + " 'conditions': [{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 
14, 28, 28, tzinfo=tzlocal()),\n", + " 'message': 'Experiment is created',\n", + " 'reason': 'ExperimentCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", + " 'message': 'Experiment is running',\n", + " 'reason': 'ExperimentRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}],\n", + " 'current_optimal_trial': {'best_trial_name': None,\n", + " 'observation': {'metrics': None},\n", + " 'parameter_assignments': None},\n", + " 'early_stopped_trial_list': None,\n", + " 'failed_trial_list': None,\n", + " 'killed_trial_list': None,\n", + " 'last_reconcile_time': None,\n", + " 'metrics_unavailable_trial_list': None,\n", + " 'pending_trial_list': None,\n", + " 'running_trial_list': ['cmaes-example-f64n8vb5',\n", + " 'cmaes-example-l6zkx5jx'],\n", + " 'start_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", + " 'succeeded_trial_list': None,\n", + " 'trial_metrics_unavailable': None,\n", + " 'trials': 2,\n", + " 'trials_early_stopped': None,\n", + " 'trials_failed': None,\n", + " 'trials_killed': None,\n", + " 'trials_pending': None,\n", + " 'trials_running': 2,\n", + " 'trials_succeeded': None}}\n", + "-----------------\n", + "\n", + "3\n", + "{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", + " 'message': 'Experiment is running',\n", + " 'reason': 'ExperimentRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}\n" + ] + } + ], + "source": [ + "exp = kclient.get_experiment(name=experiment_name, namespace=namespace)\n", + "print(exp)\n", + "print(\"-----------------\\n\")\n", + "\n", + "# Get the max trial count and latest status.\n", + "print(exp.spec.max_trial_count)\n", + "print(exp.status.conditions[-1])" + 
] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Get all Experiments\n", + "\n", + "You can get list of the current Experiments." + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "cmaes-example\n", + "from-volume-resume-cmaes\n", + "never-resume-cmaes\n" + ] + } + ], + "source": [ + "# Get names from the running Experiments.\n", + "exp_list = kclient.list_experiments(namespace=namespace)\n", + "\n", + "for exp in exp_list:\n", + " print(exp.metadata.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Get the current Experiment conditions\n", + "\n", + "You can check the current Experiment conditions and check if Experiment is Succeeded." + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", + " 'message': 'Experiment is created',\n", + " 'reason': 'ExperimentCreated',\n", + " 'status': 'True',\n", + " 'type': 'Created'},\n", + " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", + " 'message': 'Experiment is running',\n", + " 'reason': 'ExperimentRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kclient.get_experiment_conditions(name=experiment_name, namespace=namespace)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, 
+ "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kclient.is_experiment_succeeded(name=experiment_name, namespace=namespace)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## List of the current Trials\n", + "\n", + "You can get list of the current Trials with the latest status." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Trial Name: cmaes-example-dd4x6tsh\n", + "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", + " 'message': 'Trial is running',\n", + " 'reason': 'TrialRunning',\n", + " 'status': 'True',\n", + " 'type': 'Running'}\n", + "\n", + "Trial Name: cmaes-example-f64n8vb5\n", + "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", + " 'message': 'Trial has succeeded',\n", + " 'reason': 'TrialSucceeded',\n", + " 'status': 'True',\n", + " 'type': 'Succeeded'}\n", + "\n", + "Trial Name: cmaes-example-l6zkx5jx\n", + "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n", + " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n", + " 'message': 'Trial has succeeded',\n", + " 'reason': 'TrialSucceeded',\n", + " 'status': 'True',\n", + " 'type': 'Succeeded'}\n" + ] + } + ], + "source": [ + "# Trial list.\n", + "trial_list = 
kclient.list_trials(experiment_name=experiment_name, namespace=namespace)\n", + "for trial in trial_list:\n", + " print(f\"Trial Name: {trial.metadata.name}\")\n", + " print(f\"Trial Status: {trial.status.conditions[-1]}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Get the optimal HyperParameters\n", + "\n", + "You can get the current optimal Trial from your Experiment. For each metric you can see the max, min and latest value." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'best_trial_name': 'cmaes-example-l6zkx5jx',\n", + " 'observation': {'metrics': [{'latest': '0.955613',\n", + " 'max': '0.955613',\n", + " 'min': '0.955613',\n", + " 'name': 'Validation-accuracy'},\n", + " {'latest': '0.922775',\n", + " 'max': '0.922775',\n", + " 'min': '0.922775',\n", + " 'name': 'Train-accuracy'}]},\n", + " 'parameter_assignments': [{'name': 'lr', 'value': '0.04511033252270099'},\n", + " {'name': 'num-layers', 'value': '3'},\n", + " {'name': 'optimizer', 'value': 'sgd'}]}" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Optimal HPs.\n", + "kclient.get_optimal_hyperparameters(name=experiment_name, namespace=namespace)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Status for the Suggestion objects\n", + "\n", + "Once Experiment is Succeeded, you can check the Suggestion object status for more information about resume status.\n", + "\n", + "For Experiment with FromVolume you should be able to check created PVC." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Suggestion is succeeded, can't be restarted\n", + "-----------------\n", + "Suggestion is succeeded, suggestion volume is not deleted, can be restarted\n" + ] + } + ], + "source": [ + "# Get the current Suggestion status for the never resume Experiment.\n", + "suggestion = kclient.get_suggestion(name=experiment_never_resume_name, namespace=namespace)\n", + "\n", + "print(suggestion.status.conditions[-1].message)\n", + "print(\"-----------------\")\n", + "\n", + "# Get the current Suggestion status for the from volume Experiment.\n", + "suggestion = kclient.get_suggestion(name=experiment_from_volume_resume_name, namespace=namespace)\n", + "\n", + "print(suggestion.status.conditions[-1].message)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Delete your Experiments\n", + "\n", + "You can delete your Experiments." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment kubeflow-user-example-com/cmaes-example has been deleted\n", + "Experiment kubeflow-user-example-com/never-resume-cmaes has been deleted\n", + "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been deleted\n" + ] + } + ], + "source": [ + "kclient.delete_experiment(name=experiment_name, namespace=namespace)\n", + "kclient.delete_experiment(name=experiment_never_resume_name, namespace=namespace)\n", + "kclient.delete_experiment(name=experiment_from_volume_resume_name, namespace=namespace)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb new file mode 100644 index 00000000000..7152602e595 --- /dev/null +++ b/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb @@ -0,0 +1,606 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bf9ab16d-fbf6-4385-a7f8-133e4562e1e7", + "metadata": { + "tags": [] + }, + "source": [ + "# Tune and Train with Kubeflow Katib and Training Operator\n", + " \n", + "In this Notebook we are going to do the following:\n", + "\n", + "- Train Tensorflow model using 
Kubeflow Notebook.\n", + "- Improve the model HyperParameters with [Kubeflow Katib](https://www.kubeflow.org/docs/components/katib/overview/).\n", + "- Use [Multi Worker Mirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) to distributively train the model with [Kubeflow TFJob](https://www.kubeflow.org/docs/components/training/tftraining/)." + ] + }, + { + "cell_type": "markdown", + "id": "62d91e3d-904a-4a3c-b4e7-573324ba625e", + "metadata": {}, + "source": [ + "## Install Kubeflow Python SDKs\n", + "\n", + "You need to install Tensorflow package and Kubeflow SDKs to run this Notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c1a4c93", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!pip install tensorflow==2.16.1\n", + "\n", + "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", + "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1\n", + "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b52ba56d-7f61-44b5-90d6-f7e3de79f596", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# Experiment namespace\n", + "namespace = \"default\" " + ] + }, + { + "cell_type": "markdown", + "id": "11835208", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Create Train Script for CNN Model\n", + "\n", + "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e1b7f76", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def train_mnist_model(parameters):\n", + " import tensorflow as tf\n", + " import numpy as np\n", + " import logging\n", + "\n", + " logging.basicConfig(\n", + " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n", + " datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n", + " level=logging.INFO,\n", + " )\n", + " logging.info(\"--------------------------------------------------------------------------------------\")\n", + " logging.info(f\"Input Parameters: {parameters}\")\n", + " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", + "\n", + "\n", + " # Get HyperParameters from the input params dict.\n", + " lr = float(parameters[\"lr\"])\n", + " num_epoch = int(parameters[\"num_epoch\"])\n", + "\n", + " # Set dist parameters and strategy.\n", + " is_dist = parameters[\"is_dist\"]\n", + " num_workers = parameters[\"num_workers\"]\n", + " batch_size_per_worker = 64\n", + " batch_size_global = batch_size_per_worker * num_workers\n", + " strategy = tf.distribute.MultiWorkerMirroredStrategy(\n", + " communication_options=tf.distribute.experimental.CommunicationOptions(\n", + " implementation=tf.distribute.experimental.CollectiveCommunication.RING\n", + " )\n", + " )\n", + "\n", + " # Callback class for logging training.\n", + " # Katib parses metrics in this format: =.\n", + " class CustomCallback(tf.keras.callbacks.Callback):\n", + " def on_epoch_end(self, epoch, logs=None):\n", + " logging.info(\n", + " \"Epoch {}/{}. 
accuracy={:.4f} - loss={:.4f}\".format(\n", + " epoch+1, num_epoch, logs[\"accuracy\"], logs[\"loss\"]\n", + " )\n", + " )\n", + "\n", + " # Prepare MNIST Dataset.\n", + " def mnist_dataset(batch_size):\n", + " (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()\n", + " x_train = x_train / np.float32(255)\n", + " y_train = y_train.astype(np.int64)\n", + " train_dataset = (\n", + " tf.data.Dataset.from_tensor_slices((x_train, y_train))\n", + " .shuffle(60000)\n", + " .repeat()\n", + " .batch(batch_size)\n", + " )\n", + " return train_dataset\n", + "\n", + " # Build and compile CNN Model.\n", + " def build_and_compile_cnn_model():\n", + " model = tf.keras.Sequential(\n", + " [\n", + " tf.keras.layers.InputLayer(input_shape=(28, 28)),\n", + " tf.keras.layers.Reshape(target_shape=(28, 28, 1)),\n", + " tf.keras.layers.Conv2D(32, 3, activation=\"relu\"),\n", + " tf.keras.layers.Flatten(),\n", + " tf.keras.layers.Dense(128, activation=\"relu\"),\n", + " tf.keras.layers.Dense(10),\n", + " ]\n", + " )\n", + " model.compile(\n", + " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " optimizer=tf.keras.optimizers.SGD(learning_rate=lr),\n", + " metrics=[\"accuracy\"],\n", + " )\n", + " return model\n", + " \n", + " # Download Dataset.\n", + " dataset = mnist_dataset(batch_size_global)\n", + "\n", + " # For dist strategy we should build model under scope().\n", + " if is_dist:\n", + " logging.info(\"Running Distributed Training\")\n", + " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", + " with strategy.scope():\n", + " model = build_and_compile_cnn_model()\n", + " else:\n", + " logging.info(\"Running Single Worker Training\")\n", + " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", + " model = build_and_compile_cnn_model()\n", + " \n", + " # Start Training.\n", + " model.fit(\n", + " dataset,\n", + " 
epochs=num_epoch,\n", + " steps_per_epoch=70,\n", + " callbacks=[CustomCallback()],\n", + " verbose=0,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "f6f5db0c", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Run Training Locally in the Notebook\n", + "\n", + "We are going to download MNIST Dataset and start local training.\n", + "\n", + "Also, set `Epochs = 2` to reduce training time and avoid CPU overload. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ba6ea6b4", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Set Parameters for Local Training.\n", + "parameters = {\n", + " \"lr\": \"0.1\",\n", + " \"num_epoch\": \"2\",\n", + " \"is_dist\": False,\n", + " \"num_workers\": 1\n", + "}\n", + "\n", + "# Train Model locally in the Notebook.\n", + "train_mnist_model(parameters)" + ] + }, + { + "cell_type": "markdown", + "id": "23e12963", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Start Model Tuning with Katib\n", + "\n", + "If you want to improve your model, you can run HyperParameter tuning with Katib.\n", + "\n", + "The following example uses **Covariance Matrix Adaptation Evolution Strategy (CMA-ES)** algorithm to tune HyperParameters.\n", + "\n", + "We are going to tune `learning rate` and `number of epochs`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "25599637", + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import kubeflow.katib as katib\n", + "\n", + "# Set parameters with their distribution for HyperParameter Tuning with Katib.\n", + "parameters = {\n", + " \"lr\": katib.search.double(min=0.1, max=0.2),\n", + " \"num_epoch\": katib.search.int(min=10, max=15),\n", + " \"is_dist\": False,\n", + " \"num_workers\": 1\n", + "}\n", + "\n", + "# Start the Katib Experiment.\n", + "exp_name = \"tune-mnist\"\n", + "katib_client = katib.KatibClient(namespace=namespace)\n", + "\n", + "katib_client.tune(\n", + " name=exp_name,\n", + " objective=train_mnist_model, # Objective function.\n", + " parameters=parameters, # HyperParameters to tune.\n", + " algorithm_name=\"cmaes\", # Algorithm to use.\n", + " objective_metric_name=\"accuracy\", # Katib is going to optimize \"accuracy\".\n", + " additional_metric_names=[\"loss\"], # Katib is going to collect these metrics in addition to the objective metric.\n", + " max_trial_count=12, # Trial Threshold.\n", + " parallel_trial_count=2,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f01b7f6d", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Access to Katib UI\n", + "\n", + "You can check the created Experiment in the Katib UI.\n", + "\n", + "![Screenshot 2022-09-12 at 20.06.23.png](attachment:cdaf463d-28b3-4a98-bb4c-9613ca1bfa50.png)" + ] + }, + { + "cell_type": "markdown", + "id": "c35b8d00", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Get the Best HyperParameters from the Katib Experiment\n", + "\n", + "You can 
get the best HyperParameters from the most optimal Katib Trial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c7c92b8", + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "katib_client.wait_for_experiment_condition(exp_name, namespace=namespace)\n", + "status = katib_client.is_experiment_succeeded(exp_name, namespace=namespace)\n", + "print(f\"Katib Experiment is Succeeded: {status}\\n\")\n", + "\n", + "best_hps = katib_client.get_optimal_hyperparameters(exp_name, namespace=namespace)\n", + "\n", + "if best_hps != None:\n", + " print(\"Current Optimal Trial\\n\")\n", + " print(best_hps)\n", + " \n", + " for hp in best_hps.parameter_assignments:\n", + " if hp.name == \"lr\":\n", + " best_lr = hp.value\n", + " else:\n", + " best_num_epoch = hp.value" + ] + }, + { + "cell_type": "markdown", + "id": "f1626054", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Start Distributed Training with TFJob\n", + "\n", + "Use the best HyperParameters (`learning rate` and `number of epochs`) from the Katib Experiment and run the TFJob." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04d8bdb0", + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from kubeflow.training import TrainingClient\n", + "\n", + "# Set Parameters for Distributed Training with TFJob.\n", + "parameters = {\n", + " \"lr\": best_lr,\n", + " \"num_epoch\": best_num_epoch,\n", + " \"is_dist\": True,\n", + " \"num_workers\": 5\n", + "}\n", + "\n", + "# Start TFJob Training.\n", + "tfjob_name = \"train-mnist\"\n", + "tfjob_client = TrainingClient(namespace=namespace)\n", + "\n", + "#create_tfjob_from_func\n", + "tfjob_client.create_job(\n", + " name=tfjob_name,\n", + " namespace=namespace,\n", + " job_kind=\"TFJob\",\n", + " train_func=train_mnist_model,\n", + " parameters=parameters, # Input parameters for the train function.\n", + " num_workers=5, # How many TFJob Workers will be run.\n", + " base_image=\"tensorflow/tensorflow:2.10.0\", # Use TensorFlow image\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "589c6ec1", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "### Get TFJob Status and Training Logs\n", + "\n", + "You can check the TFJob status and logs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aae9701d", + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ec788ab", + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")" + ] + }, + { + "cell_type": "markdown", + "id": "5d4fdc13", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "## Delete Katib Experiment and TFJob\n", + "\n", + "When jobs are finished, you can delete the resources." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7453ca6c", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "katib_client.delete_experiment(exp_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d56c24f", + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e238a638-cf77-423f-a346-f763fc8b1582", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/v1beta1/sdk/tune-train-from-func.ipynb b/examples/v1beta1/sdk/tune-train-from-func.ipynb index d42cbad4312..a0b7426820b 100644 --- a/examples/v1beta1/sdk/tune-train-from-func.ipynb +++ b/examples/v1beta1/sdk/tune-train-from-func.ipynb @@ -4,13 +4,6 @@ "cell_type": "markdown", "id": "bf9ab16d-fbf6-4385-a7f8-133e4562e1e7", "metadata": { - "editable": true, - "pycharm": { - "name": "#%% md\n" - }, - "slideshow": { - "slide_type": "" - }, "tags": [] }, "source": [ @@ -26,11 +19,7 @@ { "cell_type": "markdown", "id": "62d91e3d-904a-4a3c-b4e7-573324ba625e", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, + "metadata": {}, "source": [ "## Install Kubeflow Python SDKs\n", "\n", @@ -40,20 
+29,20 @@ { "cell_type": "code", "execution_count": null, - "id": "5de885ca-e96a-4d59-9e78-75f6fc6f5ce7", + "id": "9c1a4c93", "metadata": { - "editable": true, + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" - }, - "slideshow": { - "slide_type": "" - }, - "tags": [] + } }, "outputs": [], "source": [ "!pip install tensorflow==2.16.1\n", + "\n", "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1\n", "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" @@ -62,16 +51,9 @@ { "cell_type": "code", "execution_count": null, - "id": "b807dfbb", + "id": "b52ba56d-7f61-44b5-90d6-f7e3de79f596", "metadata": { - "collapsed": false, "editable": true, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - }, "slideshow": { "slide_type": "" }, @@ -81,13 +63,13 @@ }, "outputs": [], "source": [ - "# Experiment namespace.\n", + "# Experiment namespace\n", "namespace = \"default\" " ] }, { "cell_type": "markdown", - "id": "9f319483", + "id": "11835208", "metadata": { "collapsed": false, "jupyter": { @@ -106,7 +88,7 @@ { "cell_type": "code", "execution_count": null, - "id": "727ec914", + "id": "0e1b7f76", "metadata": { "collapsed": false, "jupyter": { @@ -216,7 +198,7 @@ }, { "cell_type": "markdown", - "id": "4b8a6dde", + "id": "f6f5db0c", "metadata": { "collapsed": false, "jupyter": { @@ -237,7 +219,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18e13f06", + "id": "ba6ea6b4", "metadata": { "collapsed": false, "jupyter": { @@ -263,7 +245,7 @@ }, { "cell_type": "markdown", - "id": "e73eb9a5", + "id": "23e12963", "metadata": { "collapsed": false, "jupyter": { @@ -286,15 +268,20 @@ { "cell_type": "code", "execution_count": null, - "id": "841345df", + "id": "25599637", "metadata": { "collapsed": false, + "editable": true, "jupyter": { 
"outputs_hidden": false }, "pycharm": { "name": "#%%\n" - } + }, + "slideshow": { + "slide_type": "" + }, + "tags": [] }, "outputs": [], "source": [ @@ -326,7 +313,7 @@ }, { "cell_type": "markdown", - "id": "13fb2f5c", + "id": "f01b7f6d", "metadata": { "collapsed": false, "jupyter": { @@ -346,7 +333,7 @@ }, { "cell_type": "markdown", - "id": "f24c9fd7", + "id": "c35b8d00", "metadata": { "collapsed": false, "jupyter": { @@ -365,16 +352,6 @@ { "cell_type": "code", "execution_count": null, - "id": "964e2f4c", - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, "outputs": [], "source": [ "katib_client.wait_for_experiment_condition(exp_name, namespace=namespace)\n", @@ -392,34 +369,32 @@ " best_lr = hp.value\n", " else:\n", " best_num_epoch = hp.value" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "6289e27f-325d-4433-9379-7e97bc8aae69", + ], "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:08:51.055746Z", - "iopub.status.busy": "2022-09-12T19:08:51.054605Z", - "iopub.status.idle": "2022-09-12T19:08:51.246141Z", - "shell.execute_reply": "2022-09-12T19:08:51.244919Z", - "shell.execute_reply.started": "2022-09-12T19:08:51.055713Z" - }, + "collapsed": false, "pycharm": { "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T19:08:51Z INFO TFJob train-mnist has been created\n" - ] } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Start Distributed Training with TFJob\n", + "\n", + "Use the best HyperParameters (`learning rate` and `number of epochs`) from the Katib Experiment and run the TFJob." 
], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], "source": [ "from kubeflow.training import TrainingClient\n", "\n", @@ -445,294 +420,103 @@ " num_workers=5, # How many TFJob Workers will be run.\n", " base_image=\"tensorflow/tensorflow:2.10.0\", # Use TensorFlow image\n", ")" - ] - }, - { - "cell_type": "markdown", - "id": "d5d465e8-0310-4c72-ad36-209259ad5c34", + ], "metadata": { + "collapsed": false, "pycharm": { - "name": "#%% md\n" + "name": "#%%\n" } - }, + } + }, + { + "cell_type": "markdown", "source": [ "### Get TFJob Status and Training Logs\n", "\n", "You can check the TFJob status and logs." - ] + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } }, { "cell_type": "code", - "execution_count": 16, - "id": "53859cf4-7a35-4fc4-b5ee-9ba774635df0", + "execution_count": null, + "outputs": [], + "source": [ + "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")" + ], "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:06.862146Z", - "iopub.status.busy": "2022-09-12T19:10:06.861177Z", - "iopub.status.idle": "2022-09-12T19:10:06.945011Z", - "shell.execute_reply": "2022-09-12T19:10:06.943629Z", - "shell.execute_reply.started": "2022-09-12T19:10:06.862104Z" - }, + "collapsed": false, "pycharm": { "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TFJob status: Succeeded\n" - ] } - ], - "source": [ - "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")" - ] + } }, { "cell_type": "code", - "execution_count": 17, - "id": "f247670e-0bd4-4336-a40c-605ce32fad23", + "execution_count": null, + "outputs": [], + "source": [ + "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, 
job_kind=\"TFJob\")" + ], "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:11.765592Z", - "iopub.status.busy": "2022-09-12T19:10:11.764384Z", - "iopub.status.idle": "2022-09-12T19:10:14.249858Z", - "shell.execute_reply": "2022-09-12T19:10:14.248518Z", - "shell.execute_reply.started": "2022-09-12T19:10:11.765560Z" - }, + "collapsed": false, "pycharm": { "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n", - "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO Input Parameters: {'lr': '0.17016692449867332', 'num_epoch': '13', 'is_dist': True, 'num_workers': 5}\n", - "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:53.988515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008619: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n", - "2022-09-12T19:10:12Z INFO [Pod 
train-mnist-worker-0]: 2022-09-12 19:08:54.008700: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.009579: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://train-mnist-worker-0.kubeflow-andrey.svc:2222\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Check health not enabled.\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Check health not enabled.\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, 
task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n", - "11490434/11490434 [==============================] - 0s 0us/step\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO Running Distributed Training\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO --------------------------------------------------------------------------------------\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.666389: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: \"TensorSliceDataset/_2\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: op: \"TensorSliceDataset\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_0\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_1\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"Toutput_types\"\n", - 
"2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_FLOAT\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_INT64\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"_cardinality\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: i: 60000\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"is_files\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: b: false\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"metadata\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: s: \"\\n\\024TensorSliceDataset:0\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"output_shapes\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod 
train-mnist-worker-0]: list {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: shape {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: dim {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: size: 28\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: dim {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: size: 28\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: shape {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: experimental_type {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_DATASET\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_FLOAT\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - 
"2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_INT64\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.901683: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = 
CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, 
group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:04Z INFO Epoch 1/13. accuracy=0.7755 - loss=0.7565\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:09Z INFO Epoch 2/13. 
accuracy=0.9104 - loss=0.2964\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:13Z INFO Epoch 3/13. accuracy=0.9371 - loss=0.2100\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:18Z INFO Epoch 4/13. accuracy=0.9475 - loss=0.1756\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:23Z INFO Epoch 5/13. accuracy=0.9505 - loss=0.1612\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:27Z INFO Epoch 6/13. accuracy=0.9608 - loss=0.1309\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:32Z INFO Epoch 7/13. accuracy=0.9613 - loss=0.1298\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:37Z INFO Epoch 8/13. accuracy=0.9645 - loss=0.1165\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:41Z INFO Epoch 9/13. accuracy=0.9717 - loss=0.0962\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:46Z INFO Epoch 10/13. accuracy=0.9719 - loss=0.0920\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:51Z INFO Epoch 11/13. accuracy=0.9743 - loss=0.0873\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:55Z INFO Epoch 12/13. accuracy=0.9751 - loss=0.0831\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:10:00Z INFO Epoch 13/13. 
accuracy=0.9765 - loss=0.0803\n" - ] } - ], - "source": [ - "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")" - ] + } }, { "cell_type": "markdown", - "id": "227c0a9a-fdf5-4047-b0e2-ec15d3c120ac", - "metadata": { - "execution": { - "iopub.execute_input": "2022-08-09T23:50:29.596391Z", - "iopub.status.busy": "2022-08-09T23:50:29.596145Z", - "iopub.status.idle": "2022-08-09T23:50:29.599222Z", - "shell.execute_reply": "2022-08-09T23:50:29.598674Z", - "shell.execute_reply.started": "2022-08-09T23:50:29.596363Z" - }, - "pycharm": { - "name": "#%% md\n" - } - }, "source": [ "## Delete Katib Experiment and TFJob\n", "\n", "When jobs are finished, you can delete the resources." - ] + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } }, { "cell_type": "code", - "execution_count": 18, - "id": "dd24acd8-4305-463e-a6e6-eed16d8a7c51", + "execution_count": null, + "outputs": [], + "source": [ + "katib_client.delete_experiment(exp_name)" + ], "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:19.053646Z", - "iopub.status.busy": "2022-09-12T19:10:19.052424Z", - "iopub.status.idle": "2022-09-12T19:10:19.144593Z", - "shell.execute_reply": "2022-09-12T19:10:19.143396Z", - "shell.execute_reply.started": "2022-09-12T19:10:19.053607Z" - }, + "collapsed": false, "pycharm": { "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment tune-mnist has been deleted\n" - ] } - ], - "source": [ - "katib_client.delete_experiment(exp_name)" - ] + } }, { "cell_type": "code", - "execution_count": 19, - "id": "025fa4af-256d-4027-99ba-ba44c1409541", + "execution_count": null, + "outputs": [], + "source": [ + "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")" + ], "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:19.532471Z", - "iopub.status.busy": 
"2022-09-12T19:10:19.531949Z", - "iopub.status.idle": "2022-09-12T19:10:19.550331Z", - "shell.execute_reply": "2022-09-12T19:10:19.549103Z", - "shell.execute_reply.started": "2022-09-12T19:10:19.532441Z" - }, + "collapsed": false, "pycharm": { "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T19:10:19Z INFO TFJob train-mnist has been deleted\n" - ] } - ], - "source": [ - "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")" - ] + } }, { "cell_type": "code", "execution_count": null, "id": "e238a638-cf77-423f-a346-f763fc8b1582", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, + "metadata": {}, "outputs": [], "source": [] }