From 7d4bff3fb7580892df93f84e69c2a9efcf9f3ee3 Mon Sep 17 00:00:00 2001 From: Yehudit Kerido Date: Mon, 2 Dec 2024 10:47:41 +0200 Subject: [PATCH] Add wait for experiment success condition and parameterized ns Signed-off-by: Yehudit Kerido --- ...cmaes-and-resume-policies-checkpoint.ipynb | 977 ------------------ .../tune-train-from-func-checkpoint.ipynb | 695 ------------- .../sdk/cmaes-and-resume-policies.ipynb | 39 +- .../v1beta1/sdk/tune-train-from-func.ipynb | 175 ++-- 4 files changed, 137 insertions(+), 1749 deletions(-) delete mode 100644 examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb delete mode 100644 examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb deleted file mode 100644 index be8737ee8f8..00000000000 --- a/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb +++ /dev/null @@ -1,977 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "# HyperParameter tunning using CMA-ES\n", - "\n", - "In this example you will deploy 3 Katib Experiments with Covariance Matrix Adaptation Evolution Strategy (CMA-ES) using Jupyter Notebook and Katib SDK. These Experiments have various resume policies.\n", - "\n", - "Reference documentation:\n", - "- https://www.kubeflow.org/docs/components/katib/experiment/#cmaes\n", - "- https://www.kubeflow.org/docs/components/katib/resume-experiment/\n", - "\n", - "The notebook shows how to create, get, check status and delete an Experiment." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Install Katib SDK\n", - "\n", - "You need to install Katib SDK to run this Notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", - "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Import required packages" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import copy\n", - "\n", - "from kubeflow.katib import KatibClient\n", - "from kubernetes.client import V1ObjectMeta\n", - "from kubeflow.katib import V1beta1Experiment\n", - "from kubeflow.katib import V1beta1AlgorithmSpec\n", - "from kubeflow.katib import V1beta1ObjectiveSpec\n", - "from kubeflow.katib import V1beta1FeasibleSpace\n", - "from kubeflow.katib import V1beta1ExperimentSpec\n", - "from kubeflow.katib import V1beta1ObjectiveSpec\n", - "from kubeflow.katib import V1beta1ParameterSpec\n", - "from kubeflow.katib import V1beta1TrialTemplate\n", - "from kubeflow.katib import V1beta1TrialParameterSpec" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Define your Experiment\n", - "\n", - "You have to create your Experiment object before deploying it. This Experiment is similar to [this](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/hp-tuning/cma-es.yaml) example." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "# Experiment name and namespace.\n", - "namespace = \"kubeflow\"\n", - "experiment_name = \"cmaes-example\"\n", - "\n", - "metadata = V1ObjectMeta(\n", - " name=experiment_name,\n", - " namespace=namespace\n", - ")\n", - "\n", - "# Algorithm specification.\n", - "algorithm_spec=V1beta1AlgorithmSpec(\n", - " algorithm_name=\"cmaes\"\n", - ")\n", - "\n", - "# Objective specification.\n", - "objective_spec=V1beta1ObjectiveSpec(\n", - " type=\"minimize\",\n", - " goal= 0.001,\n", - " objective_metric_name=\"loss\",\n", - ")\n", - "\n", - "# Experiment search space. In this example we tune learning rate, number of layer and optimizer.\n", - "parameters=[\n", - " V1beta1ParameterSpec(\n", - " name=\"lr\",\n", - " parameter_type=\"double\",\n", - " feasible_space=V1beta1FeasibleSpace(\n", - " min=\"0.01\",\n", - " max=\"0.06\"\n", - " ),\n", - " ),\n", - " V1beta1ParameterSpec(\n", - " name=\"momentum\",\n", - " parameter_type=\"double\",\n", - " feasible_space=V1beta1FeasibleSpace(\n", - " min=\"0.5\",\n", - " max=\"0.9\"\n", - " ),\n", - " ),\n", - "]\n", - "\n", - "# JSON template specification for the Trial's Worker Kubernetes Job.\n", - "trial_spec={\n", - " \"apiVersion\": \"batch/v1\",\n", - " \"kind\": \"Job\",\n", - " \"spec\": {\n", - " \"template\": {\n", - " \"metadata\": {\n", - " \"annotations\": {\n", - " \"sidecar.istio.io/inject\": \"false\"\n", - " }\n", - " },\n", - " \"spec\": {\n", - " \"containers\": [\n", - " {\n", - " \"name\": \"training-container\",\n", - " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n", - " \"command\": [\n", - " \"python3\",\n", - " \"/opt/pytorch-mnist/mnist.py\",\n", - " \"--epochs=1\",\n", - " \"--batch-size=64\",\n", - " \"--lr=${trialParameters.learningRate}\",\n", - " \"--momentum=${trialParameters.momentum}\",\n", - " ]\n", - " }\n", - " ],\n", - " \"restartPolicy\": \"Never\"\n", - " }\n", - " }\n", - " }\n", - "}\n", - "\n", - "# Configure parameters for the Trial template.\n", - "trial_template=V1beta1TrialTemplate(\n", - " primary_container_name=\"training-container\",\n", - " trial_parameters=[\n", - " V1beta1TrialParameterSpec(\n", - " name=\"learningRate\",\n", - " description=\"Learning rate for the training model\",\n", - " reference=\"lr\"\n", - " ),\n", - " V1beta1TrialParameterSpec(\n", - " name=\"momentum\",\n", - " description=\"Momentum for the training model\",\n", - " reference=\"momentum\"\n", - " ),\n", - " ],\n", - " trial_spec=trial_spec\n", - ")\n", - "\n", - "\n", - "# Experiment object.\n", - "experiment = V1beta1Experiment(\n", - " api_version=\"kubeflow.org/v1beta1\",\n", - " kind=\"Experiment\",\n", - " metadata=metadata,\n", - " spec=V1beta1ExperimentSpec(\n", - " max_trial_count=3,\n", - " parallel_trial_count=2,\n", - " max_failed_trial_count=1,\n", - " algorithm=algorithm_spec,\n", - " objective=objective_spec,\n", - " parameters=parameters,\n", - " trial_template=trial_template,\n", - " )\n", - ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 47, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "experiment_never_resume_name = \"never-resume-cmaes\"\n", - "experiment_from_volume_resume_name = \"from-volume-resume-cmaes\"\n", - "\n", - "# Create new Experiments from the previous Experiment info.\n", - "# Define Experiment with Never resume.\n", - "experiment_never_resume = copy.deepcopy(experiment)\n", - "experiment_never_resume.metadata.name = experiment_never_resume_name\n", - "experiment_never_resume.spec.resume_policy = \"Never\"\n", - "experiment_never_resume.spec.max_trial_count = 4\n", - "\n", - "# Define Experiment with FromVolume resume.\n", - "experiment_from_volume_resume = copy.deepcopy(experiment)\n", - "experiment_from_volume_resume.metadata.name = experiment_from_volume_resume_name\n", - "experiment_from_volume_resume.spec.resume_policy = \"FromVolume\"\n", - "experiment_from_volume_resume.spec.max_trial_count = 4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "You can print the Experiment's info to verify it before submission." - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cmaes-example\n", - "cmaes\n", - "-----------------\n", - "never-resume-cmaes\n", - "Never\n", - "-----------------\n", - "from-volume-resume-cmaes\n", - "FromVolume\n" - ] - } - ], - "source": [ - "print(experiment.metadata.name)\n", - "print(experiment.spec.algorithm.algorithm_name)\n", - "print(\"-----------------\")\n", - "print(experiment_never_resume.metadata.name)\n", - "print(experiment_never_resume.spec.resume_policy)\n", - "print(\"-----------------\")\n", - "print(experiment_from_volume_resume.metadata.name)\n", - "print(experiment_from_volume_resume.spec.resume_policy)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Create your Experiment\n", - "\n", - "You have to create Katib client to use the SDK." - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment kubeflow-user-example-com/cmaes-example has been created\n" - ] - }, - { - "data": { - "text/html": [ - "Katib Experiment cmaes-example link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Initialize KatibClient\n", - "kclient = KatibClient(namespace=namespace)\n", - "\n", - "# Create your Experiment.\n", - "kclient.create_experiment(experiment,namespace=namespace)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### Create other Experiments" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment kubeflow-user-example-com/never-resume-cmaes has been created\n" - ] - }, - { - "data": { - "text/html": [ - "Katib Experiment never-resume-cmaes link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been created\n" - ] - }, - { - "data": { - "text/html": [ - "Katib Experiment from-volume-resume-cmaes link here" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# Create Experiment with never resume.\n", - "kclient.create_experiment(experiment_never_resume,namespace=namespace)\n", - "# Create Experiment with from volume resume.\n", - "kclient.create_experiment(experiment_from_volume_resume,namespace=namespace)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Get your Experiment\n", - "\n", - "You can get your Experiment by name and receive required data." - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'api_version': 'kubeflow.org/v1beta1',\n", - " 'kind': 'Experiment',\n", - " 'metadata': {'annotations': None,\n", - " 'creation_timestamp': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", - " 'deletion_grace_period_seconds': None,\n", - " 'deletion_timestamp': None,\n", - " 'finalizers': ['update-prometheus-metrics'],\n", - " 'generate_name': None,\n", - " 'generation': 1,\n", - " 'labels': None,\n", - " 'managed_fields': [{'api_version': 'kubeflow.org/v1beta1',\n", - " 'fields_type': 'FieldsV1',\n", - " 'fields_v1': {'f:spec': {'.': {},\n", - " 'f:algorithm': {'.': {},\n", - " 'f:algorithmName': {}},\n", - " 'f:maxFailedTrialCount': {},\n", - " 'f:maxTrialCount': {},\n", - " 'f:objective': {'.': {},\n", - " 'f:additionalMetricNames': {},\n", - " 'f:goal': {},\n", - " 'f:objectiveMetricName': {},\n", - " 'f:type': {}},\n", - " 'f:parallelTrialCount': {},\n", - " 'f:parameters': {},\n", - " 'f:trialTemplate': {'.': {},\n", - " 'f:primaryContainerName': {},\n", - " 'f:trialParameters': {},\n", - " 'f:trialSpec': {'.': {},\n", - " 'f:apiVersion': {},\n", - " 'f:kind': {},\n", - " 'f:spec': {'.': {},\n", - " 'f:template': {'.': {},\n", - " 'f:metadata': {'.': {},\n", - " 'f:annotations': {'.': {},\n", - " 'f:sidecar.istio.io/inject': {}}},\n", - " 'f:spec': {'.': {},\n", - " 'f:containers': {},\n", - " 'f:restartPolicy': {}}}}}}}},\n", - " 'manager': 'OpenAPI-Generator',\n", - " 'operation': 'Update',\n", - " 'subresource': None,\n", - " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n", - " {'api_version': 'kubeflow.org/v1beta1',\n", - " 'fields_type': 'FieldsV1',\n", - " 'fields_v1': {'f:metadata': {'f:finalizers': {'.': {},\n", - " 'v:\"update-prometheus-metrics\"': {}}}},\n", - " 'manager': 'katib-controller',\n", - " 'operation': 'Update',\n", - " 'subresource': None,\n", - " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n", - " {'api_version': 'kubeflow.org/v1beta1',\n", - " 'fields_type': 'FieldsV1',\n", - " 'fields_v1': {'f:status': {'.': {},\n", - " 'f:conditions': {},\n", - " 'f:currentOptimalTrial': {'.': {},\n", - " 'f:observation': {}},\n", - " 'f:runningTrialList': {},\n", - " 'f:startTime': {},\n", - " 'f:trials': {},\n", - " 'f:trialsRunning': {}}},\n", - " 'manager': 'katib-controller',\n", - " 'operation': 'Update',\n", - " 'subresource': 'status',\n", - " 'time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal())}],\n", - " 'name': 'cmaes-example',\n", - " 'namespace': 'kubeflow-user-example-com',\n", - " 'owner_references': None,\n", - " 'resource_version': '26516',\n", - " 'self_link': None,\n", - " 'uid': '1d59819e-4e5f-4adc-90cc-62c2ee867f72'},\n", - " 'spec': {'algorithm': {'algorithm_name': 'cmaes', 'algorithm_settings': None},\n", - " 'early_stopping': None,\n", - " 'max_failed_trial_count': 1,\n", - " 'max_trial_count': 3,\n", - " 'metrics_collector_spec': {'collector': {'custom_collector': None,\n", - " 'kind': 'StdOut'},\n", - " 'source': None},\n", - " 'nas_config': None,\n", - " 'objective': {'additional_metric_names': ['Train-accuracy'],\n", - " 'goal': 0.99,\n", - " 'metric_strategies': [{'name': 'Validation-accuracy',\n", - " 'value': 'max'},\n", - " {'name': 'Train-accuracy',\n", - " 'value': 'max'}],\n", - " 'objective_metric_name': 'Validation-accuracy',\n", - " 'type': 'maximize'},\n", - " 'parallel_trial_count': 2,\n", - " 'parameters': [{'feasible_space': {'list': None,\n", - " 'max': '0.06',\n", - " 'min': '0.01',\n", - " 'step': None},\n", - " 'name': 'lr',\n", - " 'parameter_type': 'double'},\n", - " {'feasible_space': {'list': None,\n", - " 'max': '5',\n", - " 'min': '2',\n", - " 'step': None},\n", - " 'name': 'num-layers',\n", - " 'parameter_type': 'int'},\n", - " {'feasible_space': {'list': ['sgd', 'adam', 'ftrl'],\n", - " 'max': None,\n", - " 'min': None,\n", - " 'step': None},\n", - " 'name': 'optimizer',\n", - " 'parameter_type': 'categorical'}],\n", - " 'resume_policy': 'LongRunning',\n", - " 'trial_template': {'config_map': None,\n", - " 'failure_condition': 'status.conditions.#(type==\"Failed\")#|#(status==\"True\")#',\n", - " 'primary_container_name': 'training-container',\n", - " 'primary_pod_labels': None,\n", - " 'retain': None,\n", - " 'success_condition': 'status.conditions.#(type==\"Complete\")#|#(status==\"True\")#',\n", - " 'trial_parameters': [{'description': 'Learning '\n", - " 'rate for '\n", - " 'the '\n", - " 'training '\n", - " 'model',\n", - " 'name': 'learningRate',\n", - " 'reference': 'lr'},\n", - " {'description': 'Number of '\n", - " 'training '\n", - " 'model '\n", - " 'layers',\n", - " 'name': 'numberLayers',\n", - " 'reference': 'num-layers'},\n", - " {'description': 'Training '\n", - " 'model '\n", - " 'optimizer '\n", - " '(sdg, adam '\n", - " 'or ftrl)',\n", - " 'name': 'optimizer',\n", - " 'reference': 'optimizer'}],\n", - " 'trial_spec': {'apiVersion': 'batch/v1',\n", - " 'kind': 'Job',\n", - " 'spec': {'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n", - " 'spec': {'containers': [{'command': ['python3',\n", - " '/opt/mxnet-mnist/mnist.py',\n", - " '--batch-size=64',\n", - " '--num-epochs=1',\n", - " '--lr=${trialParameters.learningRate}',\n", - " '--num-layers=${trialParameters.numberLayers}',\n", - " '--optimizer=${trialParameters.optimizer}'],\n", - " 'image': 'docker.io/kubeflowkatib/mxnet-mnist:v0.14.0',\n", - " 'name': 'training-container'}],\n", - " 'restartPolicy': 'Never'}}}}}},\n", - " 'status': {'completion_time': None,\n", - " 'conditions': [{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is created',\n", - " 'reason': 'ExperimentCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'},\n", - " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is running',\n", - " 'reason': 'ExperimentRunning',\n", - " 'status': 'True',\n", - " 'type': 'Running'}],\n", - " 'current_optimal_trial': {'best_trial_name': None,\n", - " 'observation': {'metrics': None},\n", - " 'parameter_assignments': None},\n", - " 'early_stopped_trial_list': None,\n", - " 'failed_trial_list': None,\n", - " 'killed_trial_list': None,\n", - " 'last_reconcile_time': None,\n", - " 'metrics_unavailable_trial_list': None,\n", - " 'pending_trial_list': None,\n", - " 'running_trial_list': ['cmaes-example-f64n8vb5',\n", - " 'cmaes-example-l6zkx5jx'],\n", - " 'start_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", - " 'succeeded_trial_list': None,\n", - " 'trial_metrics_unavailable': None,\n", - " 'trials': 2,\n", - " 'trials_early_stopped': None,\n", - " 'trials_failed': None,\n", - " 'trials_killed': None,\n", - " 'trials_pending': None,\n", - " 'trials_running': 2,\n", - " 'trials_succeeded': None}}\n", - "-----------------\n", - "\n", - "3\n", - "{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is running',\n", - " 'reason': 'ExperimentRunning',\n", - " 'status': 'True',\n", - " 'type': 'Running'}\n" - ] - } - ], - "source": [ - "exp = kclient.get_experiment(name=experiment_name, namespace=namespace)\n", - "print(exp)\n", - "print(\"-----------------\\n\")\n", - "\n", - "# Get the max trial count and latest status.\n", - "print(exp.spec.max_trial_count)\n", - "print(exp.status.conditions[-1])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Get all Experiments\n", - "\n", - "You can get list of the current Experiments." - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "cmaes-example\n", - "from-volume-resume-cmaes\n", - "never-resume-cmaes\n" - ] - } - ], - "source": [ - "# Get names from the running Experiments.\n", - "exp_list = kclient.list_experiments(namespace=namespace)\n", - "\n", - "for exp in exp_list:\n", - " print(exp.metadata.name)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Get the current Experiment conditions\n", - "\n", - "You can check the current Experiment conditions and check if Experiment is Succeeded." - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is created',\n", - " 'reason': 'ExperimentCreated',\n", - " 'status': 'True',\n", - " 'type': 'Created'},\n", - " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n", - " 'message': 'Experiment is running',\n", - " 'reason': 'ExperimentRunning',\n", - " 'status': 'True',\n", - " 'type': 'Running'}]" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kclient.get_experiment_conditions(name=experiment_name, namespace=namespace)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "kclient.is_experiment_succeeded(name=experiment_name, namespace=namespace)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## List of the current Trials\n", - "\n", - "You can get list of the current Trials with the latest status." - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "scrolled": true - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Trial Name: cmaes-example-dd4x6tsh\n", - "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", - " 'message': 'Trial is running',\n", - " 'reason': 'TrialRunning',\n", - " 'status': 'True',\n", - " 'type': 'Running'}\n", - "\n", - "Trial Name: cmaes-example-f64n8vb5\n", - "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n", - " 'message': 'Trial has succeeded',\n", - " 'reason': 'TrialSucceeded',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}\n", - "\n", - "Trial Name: cmaes-example-l6zkx5jx\n", - "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n", - " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n", - " 'message': 'Trial has succeeded',\n", - " 'reason': 'TrialSucceeded',\n", - " 'status': 'True',\n", - " 'type': 'Succeeded'}\n" - ] - } - ], - "source": [ - "# Trial list.\n", - "trial_list = kclient.list_trials(experiment_name=experiment_name, namespace=namespace)\n", - "for trial in trial_list:\n", - " print(f\"Trial Name: {trial.metadata.name}\")\n", - " print(f\"Trial Status: {trial.status.conditions[-1]}\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Get the optimal HyperParameters\n", - "\n", - "You can get the current optimal Trial from your Experiment. For the each metric you can see the max, min and latest value." - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'best_trial_name': 'cmaes-example-l6zkx5jx',\n", - " 'observation': {'metrics': [{'latest': '0.955613',\n", - " 'max': '0.955613',\n", - " 'min': '0.955613',\n", - " 'name': 'Validation-accuracy'},\n", - " {'latest': '0.922775',\n", - " 'max': '0.922775',\n", - " 'min': '0.922775',\n", - " 'name': 'Train-accuracy'}]},\n", - " 'parameter_assignments': [{'name': 'lr', 'value': '0.04511033252270099'},\n", - " {'name': 'num-layers', 'value': '3'},\n", - " {'name': 'optimizer', 'value': 'sgd'}]}" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Optimal HPs.\n", - "kclient.get_optimal_hyperparameters(name=experiment_name, namespace=namespace)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Status for the Suggestion objects\n", - "\n", - "Once Experiment is Succeeded, you can check the Suggestion object status for more information about resume status.\n", - "\n", - "For Experiment with FromVolume you should be able to check created PVC." - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Suggestion is succeeded, can't be restarted\n", - "-----------------\n", - "Suggestion is succeeded, suggestion volume is not deleted, can be restarted\n" - ] - } - ], - "source": [ - "# Get the current Suggestion status for the never resume Experiment.\n", - "suggestion = kclient.get_suggestion(name=experiment_never_resume_name, namespace=namespace)\n", - "\n", - "print(suggestion.status.conditions[-1].message)\n", - "print(\"-----------------\")\n", - "\n", - "# Get the current Suggestion status for the from volume Experiment.\n", - "suggestion = kclient.get_suggestion(name=experiment_from_volume_resume_name, namespace=namespace)\n", - "\n", - "print(suggestion.status.conditions[-1].message)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Delete your Experiments\n", - "\n", - "You can delete your Experiments." - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment kubeflow-user-example-com/cmaes-example has been deleted\n", - "Experiment kubeflow-user-example-com/never-resume-cmaes has been deleted\n", - "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been deleted\n" - ] - } - ], - "source": [ - "kclient.delete_experiment(name=experiment_name, namespace=namespace)\n", - "kclient.delete_experiment(name=experiment_never_resume_name, namespace=namespace)\n", - "kclient.delete_experiment(name=experiment_from_volume_resume_name, namespace=namespace)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb deleted file mode 100644 index 547f069b545..00000000000 --- a/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb +++ /dev/null @@ -1,695 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "bf9ab16d-fbf6-4385-a7f8-133e4562e1e7", - "metadata": { - "editable": true, - "pycharm": { - "name": "#%% md\n" - }, - "slideshow": { - "slide_type": "" - }, - "tags": [] - }, - "source": [ - "# Tune and Train with Kubeflow Katib and Training Operator\n", - " \n", - "In this Notebook we are going to do the following:\n", - "\n", - "- Train Tensorflow model using Kubeflow Notebook.\n", - "- Improve the model HyperParameters with [Kubeflow Katib](https://www.kubeflow.org/docs/components/katib/overview/).\n", - "- Use [Multi Worker Mirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) to distributively train the model with [Kubeflow TFJob](https://www.kubeflow.org/docs/components/training/tftraining/)." - ] - }, - { - "cell_type": "markdown", - "id": "62d91e3d-904a-4a3c-b4e7-573324ba625e", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Install Kubeflow Python SDKs\n", - "\n", - "You need to install Tensorflow package and Kubeflow SDKs to run this Notebook." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5de885ca-e96a-4d59-9e78-75f6fc6f5ce7", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "!pip install tensorflow==2.16.1\n", - "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n", - "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1\n", - "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Create Train Script for CNN Model\n", - "\n", - "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). " - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "def train_mnist_model(parameters):\n", - " import tensorflow as tf\n", - " import numpy as np\n", - " import logging\n", - "\n", - " logging.basicConfig(\n", - " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n", - " datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n", - " level=logging.INFO,\n", - " )\n", - " logging.info(\"--------------------------------------------------------------------------------------\")\n", - " logging.info(f\"Input Parameters: {parameters}\")\n", - " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", - "\n", - "\n", - " # Get HyperParameters from the input params dict.\n", - " lr = float(parameters[\"lr\"])\n", - " num_epoch = int(parameters[\"num_epoch\"])\n", - "\n", - " # Set dist parameters and strategy.\n", - " is_dist = parameters[\"is_dist\"]\n", - " num_workers = parameters[\"num_workers\"]\n", - " batch_size_per_worker = 64\n", - " batch_size_global = batch_size_per_worker * num_workers\n", - " strategy = tf.distribute.MultiWorkerMirroredStrategy(\n", - " communication_options=tf.distribute.experimental.CommunicationOptions(\n", - " implementation=tf.distribute.experimental.CollectiveCommunication.RING\n", - " )\n", - " )\n", - "\n", - " # Callback class for logging training.\n", - " # Katib parses metrics in this format: =.\n", - " class CustomCallback(tf.keras.callbacks.Callback):\n", - " def on_epoch_end(self, epoch, logs=None):\n", - " logging.info(\n", - " \"Epoch {}/{}. accuracy={:.4f} - loss={:.4f}\".format(\n", - " epoch+1, num_epoch, logs[\"accuracy\"], logs[\"loss\"]\n", - " )\n", - " )\n", - "\n", - " # Prepare MNIST Dataset.\n", - " def mnist_dataset(batch_size):\n", - " (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()\n", - " x_train = x_train / np.float32(255)\n", - " y_train = y_train.astype(np.int64)\n", - " train_dataset = (\n", - " tf.data.Dataset.from_tensor_slices((x_train, y_train))\n", - " .shuffle(60000)\n", - " .repeat()\n", - " .batch(batch_size)\n", - " )\n", - " return train_dataset\n", - "\n", - " # Build and compile CNN Model.\n", - " def build_and_compile_cnn_model():\n", - " model = tf.keras.Sequential(\n", - " [\n", - " tf.keras.layers.InputLayer(input_shape=(28, 28)),\n", - " tf.keras.layers.Reshape(target_shape=(28, 28, 1)),\n", - " tf.keras.layers.Conv2D(32, 3, activation=\"relu\"),\n", - " tf.keras.layers.Flatten(),\n", - " tf.keras.layers.Dense(128, activation=\"relu\"),\n", - " tf.keras.layers.Dense(10),\n", - " ]\n", - " )\n", - " model.compile(\n", - " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", - " optimizer=tf.keras.optimizers.SGD(learning_rate=lr),\n", - " metrics=[\"accuracy\"],\n", - " )\n", - " return model\n", - " \n", - " # Download Dataset.\n", - " dataset = mnist_dataset(batch_size_global)\n", - "\n", - " # For dist strategy we should build model under scope().\n", - " if is_dist:\n", - " logging.info(\"Running Distributed Training\")\n", - " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", - " with strategy.scope():\n", - " model = build_and_compile_cnn_model()\n", - " else:\n", - " logging.info(\"Running Single Worker Training\")\n", - " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n", - " model = build_and_compile_cnn_model()\n", - " \n", - " # Start Training.\n", - " model.fit(\n", - " dataset,\n", - " epochs=num_epoch,\n", - " steps_per_epoch=70,\n", - " callbacks=[CustomCallback()],\n", - " verbose=0,\n", - " )" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Run Training Locally in the Notebook\n", - "\n", - "We are going to download MNIST Dataset and start local training.\n", - "\n", - "Also, set `Epochs = 2` to reduce training time and avoid CPU overload. " - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "# Set Parameters for Local Training.\n", - "parameters = {\n", - " \"lr\": \"0.1\",\n", - " \"num_epoch\": \"2\",\n", - " \"is_dist\": False,\n", - " \"num_workers\": 1\n", - "}\n", - "\n", - "# Train Model locally in the Notebook.\n", - "train_mnist_model(parameters)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "## Start Model Tuning with Katib\n", - "\n", - "If you want to improve your model, you can run HyperParameter tuning with Katib.\n", - "\n", - "The following example uses **Covariance Matrix Adaptation Evolution Strategy (CMA-ES)** algorithm to tune HyperParameters.\n", - "\n", - "We are going to tune `learning rate` and `number of epochs`." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import kubeflow.katib as katib\n", - "\n", - "# Set parameters with their distribution for HyperParameter Tuning with Katib.\n", - "parameters = {\n", - " \"lr\": katib.search.double(min=0.1, max=0.2),\n", - " \"num_epoch\": katib.search.int(min=10, max=15),\n", - " \"is_dist\": False,\n", - " \"num_workers\": 1\n", - "}\n", - "\n", - "# Start the Katib Experiment.\n", - "exp_name = \"tune-mnist\"\n", - "katib_client = katib.KatibClient(namespace=namespace)\n", - "\n", - "katib_client.tune(\n", - " name=exp_name,\n", - " objective=train_mnist_model, # Objective function.\n", - " parameters=parameters, # HyperParameters to tune.\n", - " algorithm_name=\"cmaes\", # Alorithm to use.\n", - " objective_metric_name=\"accuracy\", # Katib is going to optimize \"accuracy\".\n", - " additional_metric_names=[\"loss\"], # Katib is going to collect these metrics in addition to the objective metric.\n", - " max_trial_count=12, # Trial Threshold.\n", - " parallel_trial_count=2,\n", - ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "### Access to Katib UI\n", - "\n", - "You can check created Experiment in the Katib UI.\n", - "\n", - "![Screenshot 2022-09-12 at 20.06.23.png](attachment:cdaf463d-28b3-4a98-bb4c-9613ca1bfa50.png)" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "markdown", - "source": [ - "### Get the Best HyperParameters from the Katib Experiment\n", - "\n", - "You can get the best HyperParameters from the most optimal Katib Trial." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "import time\n", - "time.sleep(120)\n", - "status = katib_client.is_experiment_succeeded(exp_name, namespace=namespace)\n", - "print(f\"Katib Experiment is Succeeded: {status}\\n\")\n", - "\n", - "best_hps = katib_client.get_optimal_hyperparameters(exp_name, namespace=namespace)\n", - "\n", - "if best_hps != None:\n", - " print(\"Current Optimal Trial\\n\")\n", - " print(best_hps)\n", - " \n", - " for hp in best_hps.parameter_assignments:\n", - " if hp.name == \"lr\":\n", - " best_lr = hp.value\n", - " else:\n", - " best_num_epoch = hp.value" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "6289e27f-325d-4433-9379-7e97bc8aae69", - "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:08:51.055746Z", - "iopub.status.busy": "2022-09-12T19:08:51.054605Z", - "iopub.status.idle": "2022-09-12T19:08:51.246141Z", - "shell.execute_reply": "2022-09-12T19:08:51.244919Z", - "shell.execute_reply.started": "2022-09-12T19:08:51.055713Z" - }, - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T19:08:51Z INFO TFJob train-mnist has been created\n" - ] - } - ], - "source": [ - "from kubeflow.training import TrainingClient\n", - "\n", - "# Set Parameters for Distributed Training with TFJob.\n", - "parameters = {\n", - " \"lr\": best_lr,\n", - " \"num_epoch\": best_num_epoch,\n", - " \"is_dist\": True,\n", - " \"num_workers\": 5\n", - "}\n", - "\n", - "# Start TFJob Training.\n", - "tfjob_name = \"train-mnist\"\n", - "tfjob_client = TrainingClient(namespace=namespace)\n", - "\n", - "#create_tfjob_from_func\n", - "tfjob_client.create_job(\n", - " name=tfjob_name,\n", - " namespace=namespace,\n", - " job_kind=\"TFJob\",\n", - " train_func=train_mnist_model,\n", - " parameters=parameters, # Input parameters for the train function.\n", - " num_workers=5, # How many TFJob Workers will be run.\n", - " base_image=\"tensorflow/tensorflow:2.10.0\", # Use TensorFlow image\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "d5d465e8-0310-4c72-ad36-209259ad5c34", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "### Get TFJob Status and Training Logs\n", - "\n", - "You can check the TFJob status and logs." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "53859cf4-7a35-4fc4-b5ee-9ba774635df0", - "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:06.862146Z", - "iopub.status.busy": "2022-09-12T19:10:06.861177Z", - "iopub.status.idle": "2022-09-12T19:10:06.945011Z", - "shell.execute_reply": "2022-09-12T19:10:06.943629Z", - "shell.execute_reply.started": "2022-09-12T19:10:06.862104Z" - }, - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "TFJob status: Succeeded\n" - ] - } - ], - "source": [ - "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "f247670e-0bd4-4336-a40c-605ce32fad23", - "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:11.765592Z", - "iopub.status.busy": "2022-09-12T19:10:11.764384Z", - "iopub.status.idle": "2022-09-12T19:10:14.249858Z", - "shell.execute_reply": "2022-09-12T19:10:14.248518Z", - "shell.execute_reply.started": "2022-09-12T19:10:11.765560Z" - }, - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n", - "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO Input Parameters: {'lr': '0.17016692449867332', 'num_epoch': '13', 'is_dist': True, 'num_workers': 5}\n", - "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:53.988515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008619: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008700: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.009579: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://train-mnist-worker-0.kubeflow-andrey.svc:2222\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Check health not enabled.\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Check health not enabled.\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n", - "11490434/11490434 [==============================] - 0s 0us/step\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO Running Distributed Training\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO --------------------------------------------------------------------------------------\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.666389: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: \"TensorSliceDataset/_2\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: op: \"TensorSliceDataset\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_0\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_1\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"Toutput_types\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_FLOAT\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_INT64\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"_cardinality\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: i: 60000\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"is_files\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: b: false\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"metadata\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: s: \"\\n\\024TensorSliceDataset:0\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"output_shapes\"\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: shape {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: dim {\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: size: 28\n", - "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: dim {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: size: 28\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: shape {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: experimental_type {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_DATASET\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_FLOAT\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_INT64\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.901683: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:04Z INFO Epoch 1/13. accuracy=0.7755 - loss=0.7565\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:09Z INFO Epoch 2/13. accuracy=0.9104 - loss=0.2964\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:13Z INFO Epoch 3/13. accuracy=0.9371 - loss=0.2100\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:18Z INFO Epoch 4/13. accuracy=0.9475 - loss=0.1756\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:23Z INFO Epoch 5/13. accuracy=0.9505 - loss=0.1612\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:27Z INFO Epoch 6/13. accuracy=0.9608 - loss=0.1309\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:32Z INFO Epoch 7/13. accuracy=0.9613 - loss=0.1298\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:37Z INFO Epoch 8/13. accuracy=0.9645 - loss=0.1165\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:41Z INFO Epoch 9/13. accuracy=0.9717 - loss=0.0962\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:46Z INFO Epoch 10/13. accuracy=0.9719 - loss=0.0920\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:51Z INFO Epoch 11/13. accuracy=0.9743 - loss=0.0873\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:55Z INFO Epoch 12/13. accuracy=0.9751 - loss=0.0831\n", - "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:10:00Z INFO Epoch 13/13. accuracy=0.9765 - loss=0.0803\n" - ] - } - ], - "source": [ - "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")" - ] - }, - { - "cell_type": "markdown", - "id": "227c0a9a-fdf5-4047-b0e2-ec15d3c120ac", - "metadata": { - "execution": { - "iopub.execute_input": "2022-08-09T23:50:29.596391Z", - "iopub.status.busy": "2022-08-09T23:50:29.596145Z", - "iopub.status.idle": "2022-08-09T23:50:29.599222Z", - "shell.execute_reply": "2022-08-09T23:50:29.598674Z", - "shell.execute_reply.started": "2022-08-09T23:50:29.596363Z" - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "## Delete Katib Experiment and TFJob\n", - "\n", - "When jobs are finished, you can delete the resources." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "dd24acd8-4305-463e-a6e6-eed16d8a7c51", - "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:19.053646Z", - "iopub.status.busy": "2022-09-12T19:10:19.052424Z", - "iopub.status.idle": "2022-09-12T19:10:19.144593Z", - "shell.execute_reply": "2022-09-12T19:10:19.143396Z", - "shell.execute_reply.started": "2022-09-12T19:10:19.053607Z" - }, - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment tune-mnist has been deleted\n" - ] - } - ], - "source": [ - "katib_client.delete_experiment(exp_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "025fa4af-256d-4027-99ba-ba44c1409541", - "metadata": { - "execution": { - "iopub.execute_input": "2022-09-12T19:10:19.532471Z", - "iopub.status.busy": "2022-09-12T19:10:19.531949Z", - "iopub.status.idle": "2022-09-12T19:10:19.550331Z", - "shell.execute_reply": "2022-09-12T19:10:19.549103Z", - "shell.execute_reply.started": "2022-09-12T19:10:19.532441Z" - }, - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-09-12T19:10:19Z INFO TFJob train-mnist has been deleted\n" - ] - } - ], - "source": [ - "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e238a638-cf77-423f-a346-f763fc8b1582", - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb index 27de8eff0b8..72445e7390f 100644 --- a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb +++ b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb @@ -103,22 +103,41 @@ { "cell_type": "code", "execution_count": null, + "metadata": { + "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + }, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ] + }, "outputs": [], "source": [ "# Experiment name and namespace.\n", "namespace = \"kubeflow\"\n", "experiment_name = \"cmaes-example\"\n" - ], + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" } - } - }, - { - "cell_type": "code", - "execution_count": null, + }, "outputs": [], "source": [ "\n", @@ -225,13 +244,7 @@ " trial_template=trial_template,\n", " )\n", ")" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "code", diff --git a/examples/v1beta1/sdk/tune-train-from-func.ipynb b/examples/v1beta1/sdk/tune-train-from-func.ipynb index b5ab52681e5..d42cbad4312 100644 --- a/examples/v1beta1/sdk/tune-train-from-func.ipynb +++ b/examples/v1beta1/sdk/tune-train-from-func.ipynb @@ -62,35 +62,60 @@ { "cell_type": "code", "execution_count": null, - "outputs": [], - "source": [ - "# Experiment namespace.\n", - "namespace = \"default\" " - ], + "id": "b807dfbb", "metadata": { "collapsed": false, + "editable": true, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%%\n" - } - } + }, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "# Experiment namespace.\n", + "namespace = \"default\" " + ] }, { "cell_type": "markdown", - "source": [ - "## Create Train Script for CNN Model\n", - "\n", - "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). " - ], + "id": "9f319483", "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%% md\n" } - } + }, + "source": [ + "## Create Train Script for CNN Model\n", + "\n", + "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). " + ] }, { "cell_type": "code", "execution_count": null, + "id": "727ec914", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "def train_mnist_model(parameters):\n", @@ -187,33 +212,41 @@ " callbacks=[CustomCallback()],\n", " verbose=0,\n", " )" - ], + ] + }, + { + "cell_type": "markdown", + "id": "4b8a6dde", "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "## Run Training Locally in the Notebook\n", "\n", "We are going to download MNIST Dataset and start local training.\n", "\n", "Also, set `Epochs = 2` to reduce training time and avoid CPU overload. " - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "id": "18e13f06", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "# Set Parameters for Local Training.\n", @@ -226,16 +259,20 @@ "\n", "# Train Model locally in the Notebook.\n", "train_mnist_model(parameters)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "e73eb9a5", "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "## Start Model Tuning with Katib\n", "\n", @@ -244,17 +281,21 @@ "The following example uses **Covariance Matrix Adaptation Evolution Strategy (CMA-ES)** algorithm to tune HyperParameters.\n", "\n", "We are going to tune `learning rate` and `number of epochs`." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "id": "841345df", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "import kubeflow.katib as katib\n", @@ -281,47 +322,59 @@ " max_trial_count=12, # Trial Threshold.\n", " parallel_trial_count=2,\n", ")" - ], + ] + }, + { + "cell_type": "markdown", + "id": "13fb2f5c", "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { - "name": "#%%\n" + "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### Access to Katib UI\n", "\n", "You can check created Experiment in the Katib UI.\n", "\n", "![Screenshot 2022-09-12 at 20.06.23.png](attachment:cdaf463d-28b3-4a98-bb4c-9613ca1bfa50.png)" - ], + ] + }, + { + "cell_type": "markdown", + "id": "f24c9fd7", "metadata": { "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, "pycharm": { "name": "#%% md\n" } - } - }, - { - "cell_type": "markdown", + }, "source": [ "### Get the Best HyperParameters from the Katib Experiment\n", "\n", "You can get the best HyperParameters from the most optimal Katib Trial." - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%% md\n" - } - } + ] }, { "cell_type": "code", "execution_count": null, + "id": "964e2f4c", + "metadata": { + "collapsed": false, + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, "outputs": [], "source": [ "katib_client.wait_for_experiment_condition(exp_name, namespace=namespace)\n", @@ -339,13 +392,7 @@ " best_lr = hp.value\n", " else:\n", " best_num_epoch = hp.value" - ], - "metadata": { - "collapsed": false, - "pycharm": { - "name": "#%%\n" - } - } + ] }, { "cell_type": "code",