diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb
new file mode 100644
index 00000000000..72445e7390f
--- /dev/null
+++ b/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb
@@ -0,0 +1,1008 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "editable": true,
+ "pycharm": {
+ "name": "#%% md\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "source": [
+ "# HyperParameter tuning using CMA-ES\n",
+ "\n",
+ "In this example you will deploy 3 Katib Experiments with Covariance Matrix Adaptation Evolution Strategy (CMA-ES) using Jupyter Notebook and Katib SDK. These Experiments have various resume policies.\n",
+ "\n",
+ "Reference documentation:\n",
+ "- https://www.kubeflow.org/docs/components/katib/experiment/#cmaes\n",
+ "- https://www.kubeflow.org/docs/components/katib/resume-experiment/\n",
+ "\n",
+ "The notebook shows how to create, get, check status and delete an Experiment."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Install Katib SDK\n",
+ "\n",
+ "You need to install the Katib SDK to run this Notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
+ "%pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Import required packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import copy\n",
+ "\n",
+ "from kubernetes.client import V1ObjectMeta\n",
+ "\n",
+ "from kubeflow.katib import KatibClient\n",
+ "from kubeflow.katib import V1beta1AlgorithmSpec\n",
+ "from kubeflow.katib import V1beta1Experiment\n",
+ "from kubeflow.katib import V1beta1ExperimentSpec\n",
+ "from kubeflow.katib import V1beta1FeasibleSpace\n",
+ "from kubeflow.katib import V1beta1ObjectiveSpec\n",
+ "from kubeflow.katib import V1beta1ParameterSpec\n",
+ "from kubeflow.katib import V1beta1TrialParameterSpec\n",
+ "from kubeflow.katib import V1beta1TrialTemplate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Define your Experiment\n",
+ "\n",
+ "You have to create your Experiment object before deploying it. This Experiment is similar to [this](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/hp-tuning/cma-es.yaml) example."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": [
+ "parameters"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Experiment name and namespace.\n",
+ "namespace = \"kubeflow\"\n",
+ "experiment_name = \"cmaes-example\"\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "\n",
+ "metadata = V1ObjectMeta(\n",
+ " name=experiment_name,\n",
+ " namespace=namespace\n",
+ ")\n",
+ "\n",
+ "# Algorithm specification.\n",
+ "algorithm_spec=V1beta1AlgorithmSpec(\n",
+ " algorithm_name=\"cmaes\"\n",
+ ")\n",
+ "\n",
+ "# Objective specification.\n",
+ "objective_spec=V1beta1ObjectiveSpec(\n",
+ " type=\"minimize\",\n",
+ " goal= 0.001,\n",
+ " objective_metric_name=\"loss\",\n",
+ ")\n",
+ "\n",
+ "# Experiment search space. In this example we tune the learning rate and momentum.\n",
+ "parameters=[\n",
+ " V1beta1ParameterSpec(\n",
+ " name=\"lr\",\n",
+ " parameter_type=\"double\",\n",
+ " feasible_space=V1beta1FeasibleSpace(\n",
+ " min=\"0.01\",\n",
+ " max=\"0.06\"\n",
+ " ),\n",
+ " ),\n",
+ " V1beta1ParameterSpec(\n",
+ " name=\"momentum\",\n",
+ " parameter_type=\"double\",\n",
+ " feasible_space=V1beta1FeasibleSpace(\n",
+ " min=\"0.5\",\n",
+ " max=\"0.9\"\n",
+ " ),\n",
+ " ),\n",
+ "]\n",
+ "\n",
+ "# JSON template specification for the Trial's Worker Kubernetes Job.\n",
+ "trial_spec={\n",
+ " \"apiVersion\": \"batch/v1\",\n",
+ " \"kind\": \"Job\",\n",
+ " \"spec\": {\n",
+ " \"template\": {\n",
+ " \"metadata\": {\n",
+ " \"annotations\": {\n",
+ " \"sidecar.istio.io/inject\": \"false\"\n",
+ " }\n",
+ " },\n",
+ " \"spec\": {\n",
+ " \"containers\": [\n",
+ " {\n",
+ " \"name\": \"training-container\",\n",
+ " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n",
+ " \"command\": [\n",
+ " \"python3\",\n",
+ " \"/opt/pytorch-mnist/mnist.py\",\n",
+ " \"--epochs=1\",\n",
+ " \"--batch-size=64\",\n",
+ " \"--lr=${trialParameters.learningRate}\",\n",
+ " \"--momentum=${trialParameters.momentum}\",\n",
+ " ]\n",
+ " }\n",
+ " ],\n",
+ " \"restartPolicy\": \"Never\"\n",
+ " }\n",
+ " }\n",
+ " }\n",
+ "}\n",
+ "\n",
+ "# Configure parameters for the Trial template.\n",
+ "trial_template=V1beta1TrialTemplate(\n",
+ " primary_container_name=\"training-container\",\n",
+ " trial_parameters=[\n",
+ " V1beta1TrialParameterSpec(\n",
+ " name=\"learningRate\",\n",
+ " description=\"Learning rate for the training model\",\n",
+ " reference=\"lr\"\n",
+ " ),\n",
+ " V1beta1TrialParameterSpec(\n",
+ " name=\"momentum\",\n",
+ " description=\"Momentum for the training model\",\n",
+ " reference=\"momentum\"\n",
+ " ),\n",
+ " ],\n",
+ " trial_spec=trial_spec\n",
+ ")\n",
+ "\n",
+ "\n",
+ "# Experiment object.\n",
+ "experiment = V1beta1Experiment(\n",
+ " api_version=\"kubeflow.org/v1beta1\",\n",
+ " kind=\"Experiment\",\n",
+ " metadata=metadata,\n",
+ " spec=V1beta1ExperimentSpec(\n",
+ " max_trial_count=3,\n",
+ " parallel_trial_count=2,\n",
+ " max_failed_trial_count=1,\n",
+ " algorithm=algorithm_spec,\n",
+ " objective=objective_spec,\n",
+ " parameters=parameters,\n",
+ " trial_template=trial_template,\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "experiment_never_resume_name = \"never-resume-cmaes\"\n",
+ "experiment_from_volume_resume_name = \"from-volume-resume-cmaes\"\n",
+ "\n",
+ "# Create new Experiments from the previous Experiment info.\n",
+ "# Define Experiment with Never resume.\n",
+ "experiment_never_resume = copy.deepcopy(experiment)\n",
+ "experiment_never_resume.metadata.name = experiment_never_resume_name\n",
+ "experiment_never_resume.spec.resume_policy = \"Never\"\n",
+ "experiment_never_resume.spec.max_trial_count = 4\n",
+ "\n",
+ "# Define Experiment with FromVolume resume.\n",
+ "experiment_from_volume_resume = copy.deepcopy(experiment)\n",
+ "experiment_from_volume_resume.metadata.name = experiment_from_volume_resume_name\n",
+ "experiment_from_volume_resume.spec.resume_policy = \"FromVolume\"\n",
+ "experiment_from_volume_resume.spec.max_trial_count = 4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "You can print the Experiment's info to verify it before submission."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cmaes-example\n",
+ "cmaes\n",
+ "-----------------\n",
+ "never-resume-cmaes\n",
+ "Never\n",
+ "-----------------\n",
+ "from-volume-resume-cmaes\n",
+ "FromVolume\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(experiment.metadata.name)\n",
+ "print(experiment.spec.algorithm.algorithm_name)\n",
+ "print(\"-----------------\")\n",
+ "print(experiment_never_resume.metadata.name)\n",
+ "print(experiment_never_resume.spec.resume_policy)\n",
+ "print(\"-----------------\")\n",
+ "print(experiment_from_volume_resume.metadata.name)\n",
+ "print(experiment_from_volume_resume.spec.resume_policy)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Create your Experiment\n",
+ "\n",
+ "You have to create Katib client to use the SDK."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Experiment kubeflow-user-example-com/cmaes-example has been created\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Katib Experiment cmaes-example link here"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Initialize KatibClient\n",
+ "kclient = KatibClient(namespace=namespace)\n",
+ "\n",
+ "# Create your Experiment.\n",
+ "kclient.create_experiment(experiment,namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Create other Experiments"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Experiment kubeflow-user-example-com/never-resume-cmaes has been created\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Katib Experiment never-resume-cmaes link here"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been created\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "Katib Experiment from-volume-resume-cmaes link here"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Create Experiment with never resume.\n",
+ "kclient.create_experiment(experiment_never_resume,namespace=namespace)\n",
+ "# Create Experiment with from volume resume.\n",
+ "kclient.create_experiment(experiment_from_volume_resume,namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Get your Experiment\n",
+ "\n",
+ "You can get your Experiment by name and receive required data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'api_version': 'kubeflow.org/v1beta1',\n",
+ " 'kind': 'Experiment',\n",
+ " 'metadata': {'annotations': None,\n",
+ " 'creation_timestamp': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
+ " 'deletion_grace_period_seconds': None,\n",
+ " 'deletion_timestamp': None,\n",
+ " 'finalizers': ['update-prometheus-metrics'],\n",
+ " 'generate_name': None,\n",
+ " 'generation': 1,\n",
+ " 'labels': None,\n",
+ " 'managed_fields': [{'api_version': 'kubeflow.org/v1beta1',\n",
+ " 'fields_type': 'FieldsV1',\n",
+ " 'fields_v1': {'f:spec': {'.': {},\n",
+ " 'f:algorithm': {'.': {},\n",
+ " 'f:algorithmName': {}},\n",
+ " 'f:maxFailedTrialCount': {},\n",
+ " 'f:maxTrialCount': {},\n",
+ " 'f:objective': {'.': {},\n",
+ " 'f:additionalMetricNames': {},\n",
+ " 'f:goal': {},\n",
+ " 'f:objectiveMetricName': {},\n",
+ " 'f:type': {}},\n",
+ " 'f:parallelTrialCount': {},\n",
+ " 'f:parameters': {},\n",
+ " 'f:trialTemplate': {'.': {},\n",
+ " 'f:primaryContainerName': {},\n",
+ " 'f:trialParameters': {},\n",
+ " 'f:trialSpec': {'.': {},\n",
+ " 'f:apiVersion': {},\n",
+ " 'f:kind': {},\n",
+ " 'f:spec': {'.': {},\n",
+ " 'f:template': {'.': {},\n",
+ " 'f:metadata': {'.': {},\n",
+ " 'f:annotations': {'.': {},\n",
+ " 'f:sidecar.istio.io/inject': {}}},\n",
+ " 'f:spec': {'.': {},\n",
+ " 'f:containers': {},\n",
+ " 'f:restartPolicy': {}}}}}}}},\n",
+ " 'manager': 'OpenAPI-Generator',\n",
+ " 'operation': 'Update',\n",
+ " 'subresource': None,\n",
+ " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n",
+ " {'api_version': 'kubeflow.org/v1beta1',\n",
+ " 'fields_type': 'FieldsV1',\n",
+ " 'fields_v1': {'f:metadata': {'f:finalizers': {'.': {},\n",
+ " 'v:\"update-prometheus-metrics\"': {}}}},\n",
+ " 'manager': 'katib-controller',\n",
+ " 'operation': 'Update',\n",
+ " 'subresource': None,\n",
+ " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n",
+ " {'api_version': 'kubeflow.org/v1beta1',\n",
+ " 'fields_type': 'FieldsV1',\n",
+ " 'fields_v1': {'f:status': {'.': {},\n",
+ " 'f:conditions': {},\n",
+ " 'f:currentOptimalTrial': {'.': {},\n",
+ " 'f:observation': {}},\n",
+ " 'f:runningTrialList': {},\n",
+ " 'f:startTime': {},\n",
+ " 'f:trials': {},\n",
+ " 'f:trialsRunning': {}}},\n",
+ " 'manager': 'katib-controller',\n",
+ " 'operation': 'Update',\n",
+ " 'subresource': 'status',\n",
+ " 'time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal())}],\n",
+ " 'name': 'cmaes-example',\n",
+ " 'namespace': 'kubeflow-user-example-com',\n",
+ " 'owner_references': None,\n",
+ " 'resource_version': '26516',\n",
+ " 'self_link': None,\n",
+ " 'uid': '1d59819e-4e5f-4adc-90cc-62c2ee867f72'},\n",
+ " 'spec': {'algorithm': {'algorithm_name': 'cmaes', 'algorithm_settings': None},\n",
+ " 'early_stopping': None,\n",
+ " 'max_failed_trial_count': 1,\n",
+ " 'max_trial_count': 3,\n",
+ " 'metrics_collector_spec': {'collector': {'custom_collector': None,\n",
+ " 'kind': 'StdOut'},\n",
+ " 'source': None},\n",
+ " 'nas_config': None,\n",
+ " 'objective': {'additional_metric_names': ['Train-accuracy'],\n",
+ " 'goal': 0.99,\n",
+ " 'metric_strategies': [{'name': 'Validation-accuracy',\n",
+ " 'value': 'max'},\n",
+ " {'name': 'Train-accuracy',\n",
+ " 'value': 'max'}],\n",
+ " 'objective_metric_name': 'Validation-accuracy',\n",
+ " 'type': 'maximize'},\n",
+ " 'parallel_trial_count': 2,\n",
+ " 'parameters': [{'feasible_space': {'list': None,\n",
+ " 'max': '0.06',\n",
+ " 'min': '0.01',\n",
+ " 'step': None},\n",
+ " 'name': 'lr',\n",
+ " 'parameter_type': 'double'},\n",
+ " {'feasible_space': {'list': None,\n",
+ " 'max': '5',\n",
+ " 'min': '2',\n",
+ " 'step': None},\n",
+ " 'name': 'num-layers',\n",
+ " 'parameter_type': 'int'},\n",
+ " {'feasible_space': {'list': ['sgd', 'adam', 'ftrl'],\n",
+ " 'max': None,\n",
+ " 'min': None,\n",
+ " 'step': None},\n",
+ " 'name': 'optimizer',\n",
+ " 'parameter_type': 'categorical'}],\n",
+ " 'resume_policy': 'LongRunning',\n",
+ " 'trial_template': {'config_map': None,\n",
+ " 'failure_condition': 'status.conditions.#(type==\"Failed\")#|#(status==\"True\")#',\n",
+ " 'primary_container_name': 'training-container',\n",
+ " 'primary_pod_labels': None,\n",
+ " 'retain': None,\n",
+ " 'success_condition': 'status.conditions.#(type==\"Complete\")#|#(status==\"True\")#',\n",
+ " 'trial_parameters': [{'description': 'Learning '\n",
+ " 'rate for '\n",
+ " 'the '\n",
+ " 'training '\n",
+ " 'model',\n",
+ " 'name': 'learningRate',\n",
+ " 'reference': 'lr'},\n",
+ " {'description': 'Number of '\n",
+ " 'training '\n",
+ " 'model '\n",
+ " 'layers',\n",
+ " 'name': 'numberLayers',\n",
+ " 'reference': 'num-layers'},\n",
+ " {'description': 'Training '\n",
+ " 'model '\n",
+ " 'optimizer '\n",
+ " '(sdg, adam '\n",
+ " 'or ftrl)',\n",
+ " 'name': 'optimizer',\n",
+ " 'reference': 'optimizer'}],\n",
+ " 'trial_spec': {'apiVersion': 'batch/v1',\n",
+ " 'kind': 'Job',\n",
+ " 'spec': {'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
+ " 'spec': {'containers': [{'command': ['python3',\n",
+ " '/opt/mxnet-mnist/mnist.py',\n",
+ " '--batch-size=64',\n",
+ " '--num-epochs=1',\n",
+ " '--lr=${trialParameters.learningRate}',\n",
+ " '--num-layers=${trialParameters.numberLayers}',\n",
+ " '--optimizer=${trialParameters.optimizer}'],\n",
+ " 'image': 'docker.io/kubeflowkatib/mxnet-mnist:v0.14.0',\n",
+ " 'name': 'training-container'}],\n",
+ " 'restartPolicy': 'Never'}}}}}},\n",
+ " 'status': {'completion_time': None,\n",
+ " 'conditions': [{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
+ " 'message': 'Experiment is created',\n",
+ " 'reason': 'ExperimentCreated',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Created'},\n",
+ " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
+ " 'message': 'Experiment is running',\n",
+ " 'reason': 'ExperimentRunning',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Running'}],\n",
+ " 'current_optimal_trial': {'best_trial_name': None,\n",
+ " 'observation': {'metrics': None},\n",
+ " 'parameter_assignments': None},\n",
+ " 'early_stopped_trial_list': None,\n",
+ " 'failed_trial_list': None,\n",
+ " 'killed_trial_list': None,\n",
+ " 'last_reconcile_time': None,\n",
+ " 'metrics_unavailable_trial_list': None,\n",
+ " 'pending_trial_list': None,\n",
+ " 'running_trial_list': ['cmaes-example-f64n8vb5',\n",
+ " 'cmaes-example-l6zkx5jx'],\n",
+ " 'start_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
+ " 'succeeded_trial_list': None,\n",
+ " 'trial_metrics_unavailable': None,\n",
+ " 'trials': 2,\n",
+ " 'trials_early_stopped': None,\n",
+ " 'trials_failed': None,\n",
+ " 'trials_killed': None,\n",
+ " 'trials_pending': None,\n",
+ " 'trials_running': 2,\n",
+ " 'trials_succeeded': None}}\n",
+ "-----------------\n",
+ "\n",
+ "3\n",
+ "{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
+ " 'message': 'Experiment is running',\n",
+ " 'reason': 'ExperimentRunning',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Running'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "exp = kclient.get_experiment(name=experiment_name, namespace=namespace)\n",
+ "print(exp)\n",
+ "print(\"-----------------\\n\")\n",
+ "\n",
+ "# Get the max trial count and latest status.\n",
+ "print(exp.spec.max_trial_count)\n",
+ "print(exp.status.conditions[-1])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Get all Experiments\n",
+ "\n",
+ "You can get a list of the current Experiments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "cmaes-example\n",
+ "from-volume-resume-cmaes\n",
+ "never-resume-cmaes\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Print the names of all Experiments in the namespace.\n",
+ "exp_list = kclient.list_experiments(namespace=namespace)\n",
+ "\n",
+ "for exp in exp_list:\n",
+ " print(exp.metadata.name)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Get the current Experiment conditions\n",
+ "\n",
+ "You can check the current Experiment conditions and whether the Experiment has succeeded."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
+ " 'message': 'Experiment is created',\n",
+ " 'reason': 'ExperimentCreated',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Created'},\n",
+ " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
+ " 'message': 'Experiment is running',\n",
+ " 'reason': 'ExperimentRunning',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Running'}]"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kclient.get_experiment_conditions(name=experiment_name, namespace=namespace)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "False"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kclient.is_experiment_succeeded(name=experiment_name, namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## List of the current Trials\n",
+ "\n",
+ "You can get a list of the current Trials with their latest status."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Trial Name: cmaes-example-dd4x6tsh\n",
+ "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
+ " 'message': 'Trial is running',\n",
+ " 'reason': 'TrialRunning',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Running'}\n",
+ "\n",
+ "Trial Name: cmaes-example-f64n8vb5\n",
+ "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
+ " 'message': 'Trial has succeeded',\n",
+ " 'reason': 'TrialSucceeded',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Succeeded'}\n",
+ "\n",
+ "Trial Name: cmaes-example-l6zkx5jx\n",
+ "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n",
+ " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n",
+ " 'message': 'Trial has succeeded',\n",
+ " 'reason': 'TrialSucceeded',\n",
+ " 'status': 'True',\n",
+ " 'type': 'Succeeded'}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Trial list.\n",
+ "trial_list = kclient.list_trials(experiment_name=experiment_name, namespace=namespace)\n",
+ "for trial in trial_list:\n",
+ " print(f\"Trial Name: {trial.metadata.name}\")\n",
+ " print(f\"Trial Status: {trial.status.conditions[-1]}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Get the optimal HyperParameters\n",
+ "\n",
+ "You can get the current optimal Trial from your Experiment. For each metric you can see the max, min and latest value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'best_trial_name': 'cmaes-example-l6zkx5jx',\n",
+ " 'observation': {'metrics': [{'latest': '0.955613',\n",
+ " 'max': '0.955613',\n",
+ " 'min': '0.955613',\n",
+ " 'name': 'Validation-accuracy'},\n",
+ " {'latest': '0.922775',\n",
+ " 'max': '0.922775',\n",
+ " 'min': '0.922775',\n",
+ " 'name': 'Train-accuracy'}]},\n",
+ " 'parameter_assignments': [{'name': 'lr', 'value': '0.04511033252270099'},\n",
+ " {'name': 'num-layers', 'value': '3'},\n",
+ " {'name': 'optimizer', 'value': 'sgd'}]}"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Optimal HPs.\n",
+ "kclient.get_optimal_hyperparameters(name=experiment_name, namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Status for the Suggestion objects\n",
+ "\n",
+ "Once the Experiment has succeeded, you can check the Suggestion object status for more information about the resume status.\n",
+ "\n",
+ "For the Experiment with the FromVolume resume policy, you should be able to check the created PVC."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Suggestion is succeeded, can't be restarted\n",
+ "-----------------\n",
+ "Suggestion is succeeded, suggestion volume is not deleted, can be restarted\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Get the current Suggestion status for the never resume Experiment.\n",
+ "suggestion = kclient.get_suggestion(name=experiment_never_resume_name, namespace=namespace)\n",
+ "\n",
+ "print(suggestion.status.conditions[-1].message)\n",
+ "print(\"-----------------\")\n",
+ "\n",
+ "# Get the current Suggestion status for the from volume Experiment.\n",
+ "suggestion = kclient.get_suggestion(name=experiment_from_volume_resume_name, namespace=namespace)\n",
+ "\n",
+ "print(suggestion.status.conditions[-1].message)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Delete your Experiments\n",
+ "\n",
+ "You can delete your Experiments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Experiment kubeflow-user-example-com/cmaes-example has been deleted\n",
+ "Experiment kubeflow-user-example-com/never-resume-cmaes has been deleted\n",
+ "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been deleted\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Delete the three Experiments created by this Notebook.\n",
+ "for name in (experiment_name, experiment_never_resume_name, experiment_from_volume_resume_name):\n",
+ "    kclient.delete_experiment(name=name, namespace=namespace)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb
new file mode 100644
index 00000000000..7152602e595
--- /dev/null
+++ b/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb
@@ -0,0 +1,606 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "bf9ab16d-fbf6-4385-a7f8-133e4562e1e7",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Tune and Train with Kubeflow Katib and Training Operator\n",
+ " \n",
+ "In this Notebook we are going to do the following:\n",
+ "\n",
+ "- Train Tensorflow model using Kubeflow Notebook.\n",
+ "- Improve the model HyperParameters with [Kubeflow Katib](https://www.kubeflow.org/docs/components/katib/overview/).\n",
+ "- Use [Multi Worker Mirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) to distributively train the model with [Kubeflow TFJob](https://www.kubeflow.org/docs/components/training/tftraining/)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "62d91e3d-904a-4a3c-b4e7-573324ba625e",
+ "metadata": {},
+ "source": [
+ "## Install Kubeflow Python SDKs\n",
+ "\n",
+ "You need to install Tensorflow package and Kubeflow SDKs to run this Notebook."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9c1a4c93",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!pip install tensorflow==2.16.1\n",
+ "\n",
+ "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
+ "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1\n",
+ "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b52ba56d-7f61-44b5-90d6-f7e3de79f596",
+ "metadata": {
+ "editable": true,
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": [
+ "parameters"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Experiment namespace\n",
+ "namespace = \"default\" "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11835208",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Create Train Script for CNN Model\n",
+ "\n",
+ "This is a simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using the [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e1b7f76",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def train_mnist_model(parameters):\n",
+ " import tensorflow as tf\n",
+ " import numpy as np\n",
+ " import logging\n",
+ "\n",
+ " logging.basicConfig(\n",
+ " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
+ " datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
+ " level=logging.INFO,\n",
+ " )\n",
+ " logging.info(\"--------------------------------------------------------------------------------------\")\n",
+ " logging.info(f\"Input Parameters: {parameters}\")\n",
+ " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
+ "\n",
+ "\n",
+ " # Get HyperParameters from the input params dict.\n",
+ " lr = float(parameters[\"lr\"])\n",
+ " num_epoch = int(parameters[\"num_epoch\"])\n",
+ "\n",
+ " # Set dist parameters and strategy.\n",
+ " is_dist = parameters[\"is_dist\"]\n",
+ " num_workers = parameters[\"num_workers\"]\n",
+ " batch_size_per_worker = 64\n",
+ " batch_size_global = batch_size_per_worker * num_workers\n",
+ " strategy = tf.distribute.MultiWorkerMirroredStrategy(\n",
+ " communication_options=tf.distribute.experimental.CommunicationOptions(\n",
+ " implementation=tf.distribute.experimental.CollectiveCommunication.RING\n",
+ " )\n",
+ " )\n",
+ "\n",
+ " # Callback class for logging training.\n",
+ " # Katib parses metrics in this format: <metric-name>=<metric-value>.\n",
+ " class CustomCallback(tf.keras.callbacks.Callback):\n",
+ " def on_epoch_end(self, epoch, logs=None):\n",
+ " logging.info(\n",
+ " \"Epoch {}/{}. accuracy={:.4f} - loss={:.4f}\".format(\n",
+ " epoch+1, num_epoch, logs[\"accuracy\"], logs[\"loss\"]\n",
+ " )\n",
+ " )\n",
+ "\n",
+ " # Prepare MNIST Dataset.\n",
+ " def mnist_dataset(batch_size):\n",
+ " (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()\n",
+ " x_train = x_train / np.float32(255)\n",
+ " y_train = y_train.astype(np.int64)\n",
+ " train_dataset = (\n",
+ " tf.data.Dataset.from_tensor_slices((x_train, y_train))\n",
+ " .shuffle(60000)\n",
+ " .repeat()\n",
+ " .batch(batch_size)\n",
+ " )\n",
+ " return train_dataset\n",
+ "\n",
+ " # Build and compile CNN Model.\n",
+ " def build_and_compile_cnn_model():\n",
+ " model = tf.keras.Sequential(\n",
+ " [\n",
+ " tf.keras.layers.InputLayer(input_shape=(28, 28)),\n",
+ " tf.keras.layers.Reshape(target_shape=(28, 28, 1)),\n",
+ " tf.keras.layers.Conv2D(32, 3, activation=\"relu\"),\n",
+ " tf.keras.layers.Flatten(),\n",
+ " tf.keras.layers.Dense(128, activation=\"relu\"),\n",
+ " tf.keras.layers.Dense(10),\n",
+ " ]\n",
+ " )\n",
+ " model.compile(\n",
+ " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
+ " optimizer=tf.keras.optimizers.SGD(learning_rate=lr),\n",
+ " metrics=[\"accuracy\"],\n",
+ " )\n",
+ " return model\n",
+ " \n",
+ " # Download Dataset.\n",
+ " dataset = mnist_dataset(batch_size_global)\n",
+ "\n",
+ " # For dist strategy we should build model under scope().\n",
+ " if is_dist:\n",
+ " logging.info(\"Running Distributed Training\")\n",
+ " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
+ " with strategy.scope():\n",
+ " model = build_and_compile_cnn_model()\n",
+ " else:\n",
+ " logging.info(\"Running Single Worker Training\")\n",
+ " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
+ " model = build_and_compile_cnn_model()\n",
+ " \n",
+ " # Start Training.\n",
+ " model.fit(\n",
+ " dataset,\n",
+ " epochs=num_epoch,\n",
+ " steps_per_epoch=70,\n",
+ " callbacks=[CustomCallback()],\n",
+ " verbose=0,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f6f5db0c",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Run Training Locally in the Notebook\n",
+ "\n",
+ "We are going to download MNIST Dataset and start local training.\n",
+ "\n",
+ "Also, set `Epochs = 2` to reduce training time and avoid CPU overload. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ba6ea6b4",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# Set Parameters for Local Training.\n",
+ "parameters = {\n",
+ " \"lr\": \"0.1\",\n",
+ " \"num_epoch\": \"2\",\n",
+ " \"is_dist\": False,\n",
+ " \"num_workers\": 1\n",
+ "}\n",
+ "\n",
+ "# Train Model locally in the Notebook.\n",
+ "train_mnist_model(parameters)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "23e12963",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Start Model Tuning with Katib\n",
+ "\n",
+ "If you want to improve your model, you can run HyperParameter tuning with Katib.\n",
+ "\n",
+ "The following example uses **Covariance Matrix Adaptation Evolution Strategy (CMA-ES)** algorithm to tune HyperParameters.\n",
+ "\n",
+ "We are going to tune `learning rate` and `number of epochs`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "25599637",
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import kubeflow.katib as katib\n",
+ "\n",
+ "# Set parameters with their distribution for HyperParameter Tuning with Katib.\n",
+ "parameters = {\n",
+ " \"lr\": katib.search.double(min=0.1, max=0.2),\n",
+ " \"num_epoch\": katib.search.int(min=10, max=15),\n",
+ " \"is_dist\": False,\n",
+ " \"num_workers\": 1\n",
+ "}\n",
+ "\n",
+ "# Start the Katib Experiment.\n",
+ "exp_name = \"tune-mnist\"\n",
+ "katib_client = katib.KatibClient(namespace=namespace)\n",
+ "\n",
+ "katib_client.tune(\n",
+ " name=exp_name,\n",
+ " objective=train_mnist_model, # Objective function.\n",
+ " parameters=parameters, # HyperParameters to tune.\n",
+ " algorithm_name=\"cmaes\", # Algorithm to use.\n",
+ " objective_metric_name=\"accuracy\", # Katib is going to optimize \"accuracy\".\n",
+ " additional_metric_names=[\"loss\"], # Katib is going to collect these metrics in addition to the objective metric.\n",
+ " max_trial_count=12, # Trial Threshold.\n",
+ " parallel_trial_count=2,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f01b7f6d",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Access to Katib UI\n",
+ "\n",
+ "You can check the created Experiment in the Katib UI.\n",
+ "\n",
+ "![Screenshot 2022-09-12 at 20.06.23.png](attachment:cdaf463d-28b3-4a98-bb4c-9613ca1bfa50.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c35b8d00",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Get the Best HyperParameters from the Katib Experiment\n",
+ "\n",
+ "You can get the best HyperParameters from the most optimal Katib Trial."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c7c92b8",
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "katib_client.wait_for_experiment_condition(exp_name, namespace=namespace)\n",
+ "status = katib_client.is_experiment_succeeded(exp_name, namespace=namespace)\n",
+ "print(f\"Katib Experiment is Succeeded: {status}\\n\")\n",
+ "\n",
+ "best_hps = katib_client.get_optimal_hyperparameters(exp_name, namespace=namespace)\n",
+ "\n",
+ "if best_hps != None:\n",
+ " print(\"Current Optimal Trial\\n\")\n",
+ " print(best_hps)\n",
+ " \n",
+ " for hp in best_hps.parameter_assignments:\n",
+ " if hp.name == \"lr\":\n",
+ " best_lr = hp.value\n",
+ " else:\n",
+ " best_num_epoch = hp.value"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f1626054",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Start Distributed Training with TFJob\n",
+ "\n",
+ "Use the best HyperParameters (`learning rate` and `number of epochs`) from the Katib Experiment and run the TFJob."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "04d8bdb0",
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "from kubeflow.training import TrainingClient\n",
+ "\n",
+ "# Set Parameters for Distributed Training with TFJob.\n",
+ "parameters = {\n",
+ " \"lr\": best_lr,\n",
+ " \"num_epoch\": best_num_epoch,\n",
+ " \"is_dist\": True,\n",
+ " \"num_workers\": 5\n",
+ "}\n",
+ "\n",
+ "# Start TFJob Training.\n",
+ "tfjob_name = \"train-mnist\"\n",
+ "tfjob_client = TrainingClient(namespace=namespace)\n",
+ "\n",
+ "#create_tfjob_from_func\n",
+ "tfjob_client.create_job(\n",
+ " name=tfjob_name,\n",
+ " namespace=namespace,\n",
+ " job_kind=\"TFJob\",\n",
+ " train_func=train_mnist_model,\n",
+ " parameters=parameters, # Input parameters for the train function.\n",
+ " num_workers=5, # How many TFJob Workers will be run.\n",
+ " base_image=\"tensorflow/tensorflow:2.10.0\", # Use TensorFlow image\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "589c6ec1",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Get TFJob Status and Training Logs\n",
+ "\n",
+ "You can check the TFJob status and logs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aae9701d",
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1ec788ab",
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5d4fdc13",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Delete Katib Experiment and TFJob\n",
+ "\n",
+ "When jobs are finished, you can delete the resources."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7453ca6c",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "katib_client.delete_experiment(exp_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5d56c24f",
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e238a638-cf77-423f-a346-f763fc8b1582",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/v1beta1/sdk/tune-train-from-func.ipynb b/examples/v1beta1/sdk/tune-train-from-func.ipynb
index d42cbad4312..a0b7426820b 100644
--- a/examples/v1beta1/sdk/tune-train-from-func.ipynb
+++ b/examples/v1beta1/sdk/tune-train-from-func.ipynb
@@ -4,13 +4,6 @@
"cell_type": "markdown",
"id": "bf9ab16d-fbf6-4385-a7f8-133e4562e1e7",
"metadata": {
- "editable": true,
- "pycharm": {
- "name": "#%% md\n"
- },
- "slideshow": {
- "slide_type": ""
- },
"tags": []
},
"source": [
@@ -26,11 +19,7 @@
{
"cell_type": "markdown",
"id": "62d91e3d-904a-4a3c-b4e7-573324ba625e",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
+ "metadata": {},
"source": [
"## Install Kubeflow Python SDKs\n",
"\n",
@@ -40,20 +29,20 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "5de885ca-e96a-4d59-9e78-75f6fc6f5ce7",
+ "id": "9c1a4c93",
"metadata": {
- "editable": true,
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
"name": "#%%\n"
- },
- "slideshow": {
- "slide_type": ""
- },
- "tags": []
+ }
},
"outputs": [],
"source": [
"!pip install tensorflow==2.16.1\n",
+ "\n",
"# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
"!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1\n",
"!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
@@ -62,16 +51,9 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "b807dfbb",
+ "id": "b52ba56d-7f61-44b5-90d6-f7e3de79f596",
"metadata": {
- "collapsed": false,
"editable": true,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- },
"slideshow": {
"slide_type": ""
},
@@ -81,13 +63,13 @@
},
"outputs": [],
"source": [
- "# Experiment namespace.\n",
+ "# Experiment namespace\n",
"namespace = \"default\" "
]
},
{
"cell_type": "markdown",
- "id": "9f319483",
+ "id": "11835208",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -106,7 +88,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "727ec914",
+ "id": "0e1b7f76",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -216,7 +198,7 @@
},
{
"cell_type": "markdown",
- "id": "4b8a6dde",
+ "id": "f6f5db0c",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -237,7 +219,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "18e13f06",
+ "id": "ba6ea6b4",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -263,7 +245,7 @@
},
{
"cell_type": "markdown",
- "id": "e73eb9a5",
+ "id": "23e12963",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -286,15 +268,20 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "841345df",
+ "id": "25599637",
"metadata": {
"collapsed": false,
+ "editable": true,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"name": "#%%\n"
- }
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
},
"outputs": [],
"source": [
@@ -326,7 +313,7 @@
},
{
"cell_type": "markdown",
- "id": "13fb2f5c",
+ "id": "f01b7f6d",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -346,7 +333,7 @@
},
{
"cell_type": "markdown",
- "id": "f24c9fd7",
+ "id": "c35b8d00",
"metadata": {
"collapsed": false,
"jupyter": {
@@ -365,16 +352,6 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "964e2f4c",
- "metadata": {
- "collapsed": false,
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
"outputs": [],
"source": [
"katib_client.wait_for_experiment_condition(exp_name, namespace=namespace)\n",
@@ -392,34 +369,32 @@
" best_lr = hp.value\n",
" else:\n",
" best_num_epoch = hp.value"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "6289e27f-325d-4433-9379-7e97bc8aae69",
+ ],
"metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:08:51.055746Z",
- "iopub.status.busy": "2022-09-12T19:08:51.054605Z",
- "iopub.status.idle": "2022-09-12T19:08:51.246141Z",
- "shell.execute_reply": "2022-09-12T19:08:51.244919Z",
- "shell.execute_reply.started": "2022-09-12T19:08:51.055713Z"
- },
+ "collapsed": false,
"pycharm": {
"name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-09-12T19:08:51Z INFO TFJob train-mnist has been created\n"
- ]
}
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## Start Distributed Training with TFJob\n",
+ "\n",
+ "Use the best HyperParameters (`learning rate` and `number of epochs`) from the Katib Experiment and run the TFJob."
],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
"source": [
"from kubeflow.training import TrainingClient\n",
"\n",
@@ -445,294 +420,103 @@
" num_workers=5, # How many TFJob Workers will be run.\n",
" base_image=\"tensorflow/tensorflow:2.10.0\", # Use TensorFlow image\n",
")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d5d465e8-0310-4c72-ad36-209259ad5c34",
+ ],
"metadata": {
+ "collapsed": false,
"pycharm": {
- "name": "#%% md\n"
+ "name": "#%%\n"
}
- },
+ }
+ },
+ {
+ "cell_type": "markdown",
"source": [
"### Get TFJob Status and Training Logs\n",
"\n",
"You can check the TFJob status and logs."
- ]
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ }
},
{
"cell_type": "code",
- "execution_count": 16,
- "id": "53859cf4-7a35-4fc4-b5ee-9ba774635df0",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")"
+ ],
"metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:06.862146Z",
- "iopub.status.busy": "2022-09-12T19:10:06.861177Z",
- "iopub.status.idle": "2022-09-12T19:10:06.945011Z",
- "shell.execute_reply": "2022-09-12T19:10:06.943629Z",
- "shell.execute_reply.started": "2022-09-12T19:10:06.862104Z"
- },
+ "collapsed": false,
"pycharm": {
"name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "TFJob status: Succeeded\n"
- ]
}
- ],
- "source": [
- "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")"
- ]
+ }
},
{
"cell_type": "code",
- "execution_count": 17,
- "id": "f247670e-0bd4-4336-a40c-605ce32fad23",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")"
+ ],
"metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:11.765592Z",
- "iopub.status.busy": "2022-09-12T19:10:11.764384Z",
- "iopub.status.idle": "2022-09-12T19:10:14.249858Z",
- "shell.execute_reply": "2022-09-12T19:10:14.248518Z",
- "shell.execute_reply.started": "2022-09-12T19:10:11.765560Z"
- },
+ "collapsed": false,
"pycharm": {
"name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n",
- "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO Input Parameters: {'lr': '0.17016692449867332', 'num_epoch': '13', 'is_dist': True, 'num_workers': 5}\n",
- "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:53.988515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008619: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008700: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.009579: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://train-mnist-worker-0.kubeflow-andrey.svc:2222\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Check health not enabled.\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Check health not enabled.\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n",
- "11490434/11490434 [==============================] - 0s 0us/step\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO Running Distributed Training\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO --------------------------------------------------------------------------------------\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.666389: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: \"TensorSliceDataset/_2\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: op: \"TensorSliceDataset\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_0\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_1\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"Toutput_types\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_FLOAT\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_INT64\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"_cardinality\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: i: 60000\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"is_files\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: b: false\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"metadata\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: s: \"\\n\\024TensorSliceDataset:0\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"output_shapes\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: shape {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: dim {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: size: 28\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: dim {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: size: 28\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: shape {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: experimental_type {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_DATASET\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_FLOAT\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_INT64\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.901683: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:04Z INFO Epoch 1/13. accuracy=0.7755 - loss=0.7565\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:09Z INFO Epoch 2/13. accuracy=0.9104 - loss=0.2964\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:13Z INFO Epoch 3/13. accuracy=0.9371 - loss=0.2100\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:18Z INFO Epoch 4/13. accuracy=0.9475 - loss=0.1756\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:23Z INFO Epoch 5/13. accuracy=0.9505 - loss=0.1612\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:27Z INFO Epoch 6/13. accuracy=0.9608 - loss=0.1309\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:32Z INFO Epoch 7/13. accuracy=0.9613 - loss=0.1298\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:37Z INFO Epoch 8/13. accuracy=0.9645 - loss=0.1165\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:41Z INFO Epoch 9/13. accuracy=0.9717 - loss=0.0962\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:46Z INFO Epoch 10/13. accuracy=0.9719 - loss=0.0920\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:51Z INFO Epoch 11/13. accuracy=0.9743 - loss=0.0873\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:55Z INFO Epoch 12/13. accuracy=0.9751 - loss=0.0831\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:10:00Z INFO Epoch 13/13. accuracy=0.9765 - loss=0.0803\n"
- ]
}
- ],
- "source": [
- "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")"
- ]
+ }
},
{
"cell_type": "markdown",
- "id": "227c0a9a-fdf5-4047-b0e2-ec15d3c120ac",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-08-09T23:50:29.596391Z",
- "iopub.status.busy": "2022-08-09T23:50:29.596145Z",
- "iopub.status.idle": "2022-08-09T23:50:29.599222Z",
- "shell.execute_reply": "2022-08-09T23:50:29.598674Z",
- "shell.execute_reply.started": "2022-08-09T23:50:29.596363Z"
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
"source": [
"## Delete Katib Experiment and TFJob\n",
"\n",
"When jobs are finished, you can delete the resources."
- ]
+ ],
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ }
},
{
"cell_type": "code",
- "execution_count": 18,
- "id": "dd24acd8-4305-463e-a6e6-eed16d8a7c51",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "katib_client.delete_experiment(exp_name)"
+ ],
"metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:19.053646Z",
- "iopub.status.busy": "2022-09-12T19:10:19.052424Z",
- "iopub.status.idle": "2022-09-12T19:10:19.144593Z",
- "shell.execute_reply": "2022-09-12T19:10:19.143396Z",
- "shell.execute_reply.started": "2022-09-12T19:10:19.053607Z"
- },
+ "collapsed": false,
"pycharm": {
"name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Experiment tune-mnist has been deleted\n"
- ]
}
- ],
- "source": [
- "katib_client.delete_experiment(exp_name)"
- ]
+ }
},
{
"cell_type": "code",
- "execution_count": 19,
- "id": "025fa4af-256d-4027-99ba-ba44c1409541",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")"
+ ],
"metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:19.532471Z",
- "iopub.status.busy": "2022-09-12T19:10:19.531949Z",
- "iopub.status.idle": "2022-09-12T19:10:19.550331Z",
- "shell.execute_reply": "2022-09-12T19:10:19.549103Z",
- "shell.execute_reply.started": "2022-09-12T19:10:19.532441Z"
- },
+ "collapsed": false,
"pycharm": {
"name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-09-12T19:10:19Z INFO TFJob train-mnist has been deleted\n"
- ]
}
- ],
- "source": [
- "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")"
- ]
+ }
},
{
"cell_type": "code",
"execution_count": null,
"id": "e238a638-cf77-423f-a346-f763fc8b1582",
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
+ "metadata": {},
"outputs": [],
"source": []
}