diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb
deleted file mode 100644
index be8737ee8f8..00000000000
--- a/examples/v1beta1/sdk/.ipynb_checkpoints/cmaes-and-resume-policies-checkpoint.ipynb
+++ /dev/null
@@ -1,977 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "# HyperParameter tunning using CMA-ES\n",
- "\n",
- "In this example you will deploy 3 Katib Experiments with Covariance Matrix Adaptation Evolution Strategy (CMA-ES) using Jupyter Notebook and Katib SDK. These Experiments have various resume policies.\n",
- "\n",
- "Reference documentation:\n",
- "- https://www.kubeflow.org/docs/components/katib/experiment/#cmaes\n",
- "- https://www.kubeflow.org/docs/components/katib/resume-experiment/\n",
- "\n",
- "The notebook shows how to create, get, check status and delete an Experiment."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Install Katib SDK\n",
- "\n",
- "You need to install Katib SDK to run this Notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
- "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Import required packages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "import copy\n",
- "\n",
- "from kubeflow.katib import KatibClient\n",
- "from kubernetes.client import V1ObjectMeta\n",
- "from kubeflow.katib import V1beta1Experiment\n",
- "from kubeflow.katib import V1beta1AlgorithmSpec\n",
- "from kubeflow.katib import V1beta1ObjectiveSpec\n",
- "from kubeflow.katib import V1beta1FeasibleSpace\n",
- "from kubeflow.katib import V1beta1ExperimentSpec\n",
- "from kubeflow.katib import V1beta1ObjectiveSpec\n",
- "from kubeflow.katib import V1beta1ParameterSpec\n",
- "from kubeflow.katib import V1beta1TrialTemplate\n",
- "from kubeflow.katib import V1beta1TrialParameterSpec"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Define your Experiment\n",
- "\n",
- "You have to create your Experiment object before deploying it. This Experiment is similar to [this](https://github.com/kubeflow/katib/blob/master/examples/v1beta1/hp-tuning/cma-es.yaml) example."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# Experiment name and namespace.\n",
- "namespace = \"kubeflow\"\n",
- "experiment_name = \"cmaes-example\"\n",
- "\n",
- "metadata = V1ObjectMeta(\n",
- " name=experiment_name,\n",
- " namespace=namespace\n",
- ")\n",
- "\n",
- "# Algorithm specification.\n",
- "algorithm_spec=V1beta1AlgorithmSpec(\n",
- " algorithm_name=\"cmaes\"\n",
- ")\n",
- "\n",
- "# Objective specification.\n",
- "objective_spec=V1beta1ObjectiveSpec(\n",
- " type=\"minimize\",\n",
- " goal= 0.001,\n",
- " objective_metric_name=\"loss\",\n",
- ")\n",
- "\n",
- "# Experiment search space. In this example we tune learning rate, number of layer and optimizer.\n",
- "parameters=[\n",
- " V1beta1ParameterSpec(\n",
- " name=\"lr\",\n",
- " parameter_type=\"double\",\n",
- " feasible_space=V1beta1FeasibleSpace(\n",
- " min=\"0.01\",\n",
- " max=\"0.06\"\n",
- " ),\n",
- " ),\n",
- " V1beta1ParameterSpec(\n",
- " name=\"momentum\",\n",
- " parameter_type=\"double\",\n",
- " feasible_space=V1beta1FeasibleSpace(\n",
- " min=\"0.5\",\n",
- " max=\"0.9\"\n",
- " ),\n",
- " ),\n",
- "]\n",
- "\n",
- "# JSON template specification for the Trial's Worker Kubernetes Job.\n",
- "trial_spec={\n",
- " \"apiVersion\": \"batch/v1\",\n",
- " \"kind\": \"Job\",\n",
- " \"spec\": {\n",
- " \"template\": {\n",
- " \"metadata\": {\n",
- " \"annotations\": {\n",
- " \"sidecar.istio.io/inject\": \"false\"\n",
- " }\n",
- " },\n",
- " \"spec\": {\n",
- " \"containers\": [\n",
- " {\n",
- " \"name\": \"training-container\",\n",
- " \"image\": \"docker.io/kubeflowkatib/pytorch-mnist-cpu:v0.14.0\",\n",
- " \"command\": [\n",
- " \"python3\",\n",
- " \"/opt/pytorch-mnist/mnist.py\",\n",
- " \"--epochs=1\",\n",
- " \"--batch-size=64\",\n",
- " \"--lr=${trialParameters.learningRate}\",\n",
- " \"--momentum=${trialParameters.momentum}\",\n",
- " ]\n",
- " }\n",
- " ],\n",
- " \"restartPolicy\": \"Never\"\n",
- " }\n",
- " }\n",
- " }\n",
- "}\n",
- "\n",
- "# Configure parameters for the Trial template.\n",
- "trial_template=V1beta1TrialTemplate(\n",
- " primary_container_name=\"training-container\",\n",
- " trial_parameters=[\n",
- " V1beta1TrialParameterSpec(\n",
- " name=\"learningRate\",\n",
- " description=\"Learning rate for the training model\",\n",
- " reference=\"lr\"\n",
- " ),\n",
- " V1beta1TrialParameterSpec(\n",
- " name=\"momentum\",\n",
- " description=\"Momentum for the training model\",\n",
- " reference=\"momentum\"\n",
- " ),\n",
- " ],\n",
- " trial_spec=trial_spec\n",
- ")\n",
- "\n",
- "\n",
- "# Experiment object.\n",
- "experiment = V1beta1Experiment(\n",
- " api_version=\"kubeflow.org/v1beta1\",\n",
- " kind=\"Experiment\",\n",
- " metadata=metadata,\n",
- " spec=V1beta1ExperimentSpec(\n",
- " max_trial_count=3,\n",
- " parallel_trial_count=2,\n",
- " max_failed_trial_count=1,\n",
- " algorithm=algorithm_spec,\n",
- " objective=objective_spec,\n",
- " parameters=parameters,\n",
- " trial_template=trial_template,\n",
- " )\n",
- ")"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "experiment_never_resume_name = \"never-resume-cmaes\"\n",
- "experiment_from_volume_resume_name = \"from-volume-resume-cmaes\"\n",
- "\n",
- "# Create new Experiments from the previous Experiment info.\n",
- "# Define Experiment with Never resume.\n",
- "experiment_never_resume = copy.deepcopy(experiment)\n",
- "experiment_never_resume.metadata.name = experiment_never_resume_name\n",
- "experiment_never_resume.spec.resume_policy = \"Never\"\n",
- "experiment_never_resume.spec.max_trial_count = 4\n",
- "\n",
- "# Define Experiment with FromVolume resume.\n",
- "experiment_from_volume_resume = copy.deepcopy(experiment)\n",
- "experiment_from_volume_resume.metadata.name = experiment_from_volume_resume_name\n",
- "experiment_from_volume_resume.spec.resume_policy = \"FromVolume\"\n",
- "experiment_from_volume_resume.spec.max_trial_count = 4"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "You can print the Experiment's info to verify it before submission."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "cmaes-example\n",
- "cmaes\n",
- "-----------------\n",
- "never-resume-cmaes\n",
- "Never\n",
- "-----------------\n",
- "from-volume-resume-cmaes\n",
- "FromVolume\n"
- ]
- }
- ],
- "source": [
- "print(experiment.metadata.name)\n",
- "print(experiment.spec.algorithm.algorithm_name)\n",
- "print(\"-----------------\")\n",
- "print(experiment_never_resume.metadata.name)\n",
- "print(experiment_never_resume.spec.resume_policy)\n",
- "print(\"-----------------\")\n",
- "print(experiment_from_volume_resume.metadata.name)\n",
- "print(experiment_from_volume_resume.spec.resume_policy)\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Create your Experiment\n",
- "\n",
- "You have to create Katib client to use the SDK."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Experiment kubeflow-user-example-com/cmaes-example has been created\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "Katib Experiment cmaes-example link here"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Initialize KatibClient\n",
- "kclient = KatibClient(namespace=namespace)\n",
- "\n",
- "# Create your Experiment.\n",
- "kclient.create_experiment(experiment,namespace=namespace)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "### Create other Experiments"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Experiment kubeflow-user-example-com/never-resume-cmaes has been created\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "Katib Experiment never-resume-cmaes link here"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been created\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "Katib Experiment from-volume-resume-cmaes link here"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Create Experiment with never resume.\n",
- "kclient.create_experiment(experiment_never_resume,namespace=namespace)\n",
- "# Create Experiment with from volume resume.\n",
- "kclient.create_experiment(experiment_from_volume_resume,namespace=namespace)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Get your Experiment\n",
- "\n",
- "You can get your Experiment by name and receive required data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'api_version': 'kubeflow.org/v1beta1',\n",
- " 'kind': 'Experiment',\n",
- " 'metadata': {'annotations': None,\n",
- " 'creation_timestamp': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
- " 'deletion_grace_period_seconds': None,\n",
- " 'deletion_timestamp': None,\n",
- " 'finalizers': ['update-prometheus-metrics'],\n",
- " 'generate_name': None,\n",
- " 'generation': 1,\n",
- " 'labels': None,\n",
- " 'managed_fields': [{'api_version': 'kubeflow.org/v1beta1',\n",
- " 'fields_type': 'FieldsV1',\n",
- " 'fields_v1': {'f:spec': {'.': {},\n",
- " 'f:algorithm': {'.': {},\n",
- " 'f:algorithmName': {}},\n",
- " 'f:maxFailedTrialCount': {},\n",
- " 'f:maxTrialCount': {},\n",
- " 'f:objective': {'.': {},\n",
- " 'f:additionalMetricNames': {},\n",
- " 'f:goal': {},\n",
- " 'f:objectiveMetricName': {},\n",
- " 'f:type': {}},\n",
- " 'f:parallelTrialCount': {},\n",
- " 'f:parameters': {},\n",
- " 'f:trialTemplate': {'.': {},\n",
- " 'f:primaryContainerName': {},\n",
- " 'f:trialParameters': {},\n",
- " 'f:trialSpec': {'.': {},\n",
- " 'f:apiVersion': {},\n",
- " 'f:kind': {},\n",
- " 'f:spec': {'.': {},\n",
- " 'f:template': {'.': {},\n",
- " 'f:metadata': {'.': {},\n",
- " 'f:annotations': {'.': {},\n",
- " 'f:sidecar.istio.io/inject': {}}},\n",
- " 'f:spec': {'.': {},\n",
- " 'f:containers': {},\n",
- " 'f:restartPolicy': {}}}}}}}},\n",
- " 'manager': 'OpenAPI-Generator',\n",
- " 'operation': 'Update',\n",
- " 'subresource': None,\n",
- " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n",
- " {'api_version': 'kubeflow.org/v1beta1',\n",
- " 'fields_type': 'FieldsV1',\n",
- " 'fields_v1': {'f:metadata': {'f:finalizers': {'.': {},\n",
- " 'v:\"update-prometheus-metrics\"': {}}}},\n",
- " 'manager': 'katib-controller',\n",
- " 'operation': 'Update',\n",
- " 'subresource': None,\n",
- " 'time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal())},\n",
- " {'api_version': 'kubeflow.org/v1beta1',\n",
- " 'fields_type': 'FieldsV1',\n",
- " 'fields_v1': {'f:status': {'.': {},\n",
- " 'f:conditions': {},\n",
- " 'f:currentOptimalTrial': {'.': {},\n",
- " 'f:observation': {}},\n",
- " 'f:runningTrialList': {},\n",
- " 'f:startTime': {},\n",
- " 'f:trials': {},\n",
- " 'f:trialsRunning': {}}},\n",
- " 'manager': 'katib-controller',\n",
- " 'operation': 'Update',\n",
- " 'subresource': 'status',\n",
- " 'time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal())}],\n",
- " 'name': 'cmaes-example',\n",
- " 'namespace': 'kubeflow-user-example-com',\n",
- " 'owner_references': None,\n",
- " 'resource_version': '26516',\n",
- " 'self_link': None,\n",
- " 'uid': '1d59819e-4e5f-4adc-90cc-62c2ee867f72'},\n",
- " 'spec': {'algorithm': {'algorithm_name': 'cmaes', 'algorithm_settings': None},\n",
- " 'early_stopping': None,\n",
- " 'max_failed_trial_count': 1,\n",
- " 'max_trial_count': 3,\n",
- " 'metrics_collector_spec': {'collector': {'custom_collector': None,\n",
- " 'kind': 'StdOut'},\n",
- " 'source': None},\n",
- " 'nas_config': None,\n",
- " 'objective': {'additional_metric_names': ['Train-accuracy'],\n",
- " 'goal': 0.99,\n",
- " 'metric_strategies': [{'name': 'Validation-accuracy',\n",
- " 'value': 'max'},\n",
- " {'name': 'Train-accuracy',\n",
- " 'value': 'max'}],\n",
- " 'objective_metric_name': 'Validation-accuracy',\n",
- " 'type': 'maximize'},\n",
- " 'parallel_trial_count': 2,\n",
- " 'parameters': [{'feasible_space': {'list': None,\n",
- " 'max': '0.06',\n",
- " 'min': '0.01',\n",
- " 'step': None},\n",
- " 'name': 'lr',\n",
- " 'parameter_type': 'double'},\n",
- " {'feasible_space': {'list': None,\n",
- " 'max': '5',\n",
- " 'min': '2',\n",
- " 'step': None},\n",
- " 'name': 'num-layers',\n",
- " 'parameter_type': 'int'},\n",
- " {'feasible_space': {'list': ['sgd', 'adam', 'ftrl'],\n",
- " 'max': None,\n",
- " 'min': None,\n",
- " 'step': None},\n",
- " 'name': 'optimizer',\n",
- " 'parameter_type': 'categorical'}],\n",
- " 'resume_policy': 'LongRunning',\n",
- " 'trial_template': {'config_map': None,\n",
- " 'failure_condition': 'status.conditions.#(type==\"Failed\")#|#(status==\"True\")#',\n",
- " 'primary_container_name': 'training-container',\n",
- " 'primary_pod_labels': None,\n",
- " 'retain': None,\n",
- " 'success_condition': 'status.conditions.#(type==\"Complete\")#|#(status==\"True\")#',\n",
- " 'trial_parameters': [{'description': 'Learning '\n",
- " 'rate for '\n",
- " 'the '\n",
- " 'training '\n",
- " 'model',\n",
- " 'name': 'learningRate',\n",
- " 'reference': 'lr'},\n",
- " {'description': 'Number of '\n",
- " 'training '\n",
- " 'model '\n",
- " 'layers',\n",
- " 'name': 'numberLayers',\n",
- " 'reference': 'num-layers'},\n",
- " {'description': 'Training '\n",
- " 'model '\n",
- " 'optimizer '\n",
- " '(sdg, adam '\n",
- " 'or ftrl)',\n",
- " 'name': 'optimizer',\n",
- " 'reference': 'optimizer'}],\n",
- " 'trial_spec': {'apiVersion': 'batch/v1',\n",
- " 'kind': 'Job',\n",
- " 'spec': {'template': {'metadata': {'annotations': {'sidecar.istio.io/inject': 'false'}},\n",
- " 'spec': {'containers': [{'command': ['python3',\n",
- " '/opt/mxnet-mnist/mnist.py',\n",
- " '--batch-size=64',\n",
- " '--num-epochs=1',\n",
- " '--lr=${trialParameters.learningRate}',\n",
- " '--num-layers=${trialParameters.numberLayers}',\n",
- " '--optimizer=${trialParameters.optimizer}'],\n",
- " 'image': 'docker.io/kubeflowkatib/mxnet-mnist:v0.14.0',\n",
- " 'name': 'training-container'}],\n",
- " 'restartPolicy': 'Never'}}}}}},\n",
- " 'status': {'completion_time': None,\n",
- " 'conditions': [{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
- " 'message': 'Experiment is created',\n",
- " 'reason': 'ExperimentCreated',\n",
- " 'status': 'True',\n",
- " 'type': 'Created'},\n",
- " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
- " 'message': 'Experiment is running',\n",
- " 'reason': 'ExperimentRunning',\n",
- " 'status': 'True',\n",
- " 'type': 'Running'}],\n",
- " 'current_optimal_trial': {'best_trial_name': None,\n",
- " 'observation': {'metrics': None},\n",
- " 'parameter_assignments': None},\n",
- " 'early_stopped_trial_list': None,\n",
- " 'failed_trial_list': None,\n",
- " 'killed_trial_list': None,\n",
- " 'last_reconcile_time': None,\n",
- " 'metrics_unavailable_trial_list': None,\n",
- " 'pending_trial_list': None,\n",
- " 'running_trial_list': ['cmaes-example-f64n8vb5',\n",
- " 'cmaes-example-l6zkx5jx'],\n",
- " 'start_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
- " 'succeeded_trial_list': None,\n",
- " 'trial_metrics_unavailable': None,\n",
- " 'trials': 2,\n",
- " 'trials_early_stopped': None,\n",
- " 'trials_failed': None,\n",
- " 'trials_killed': None,\n",
- " 'trials_pending': None,\n",
- " 'trials_running': 2,\n",
- " 'trials_succeeded': None}}\n",
- "-----------------\n",
- "\n",
- "3\n",
- "{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
- " 'message': 'Experiment is running',\n",
- " 'reason': 'ExperimentRunning',\n",
- " 'status': 'True',\n",
- " 'type': 'Running'}\n"
- ]
- }
- ],
- "source": [
- "exp = kclient.get_experiment(name=experiment_name, namespace=namespace)\n",
- "print(exp)\n",
- "print(\"-----------------\\n\")\n",
- "\n",
- "# Get the max trial count and latest status.\n",
- "print(exp.spec.max_trial_count)\n",
- "print(exp.status.conditions[-1])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Get all Experiments\n",
- "\n",
- "You can get list of the current Experiments."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "cmaes-example\n",
- "from-volume-resume-cmaes\n",
- "never-resume-cmaes\n"
- ]
- }
- ],
- "source": [
- "# Get names from the running Experiments.\n",
- "exp_list = kclient.list_experiments(namespace=namespace)\n",
- "\n",
- "for exp in exp_list:\n",
- " print(exp.metadata.name)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Get the current Experiment conditions\n",
- "\n",
- "You can check the current Experiment conditions and check if Experiment is Succeeded."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[{'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 28, tzinfo=tzlocal()),\n",
- " 'message': 'Experiment is created',\n",
- " 'reason': 'ExperimentCreated',\n",
- " 'status': 'True',\n",
- " 'type': 'Created'},\n",
- " {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 28, 52, tzinfo=tzlocal()),\n",
- " 'message': 'Experiment is running',\n",
- " 'reason': 'ExperimentRunning',\n",
- " 'status': 'True',\n",
- " 'type': 'Running'}]"
- ]
- },
- "execution_count": 53,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "kclient.get_experiment_conditions(name=experiment_name, namespace=namespace)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "False"
- ]
- },
- "execution_count": 54,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "kclient.is_experiment_succeeded(name=experiment_name, namespace=namespace)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## List of the current Trials\n",
- "\n",
- "You can get list of the current Trials with the latest status."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 55,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- },
- "scrolled": true
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Trial Name: cmaes-example-dd4x6tsh\n",
- "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
- " 'message': 'Trial is running',\n",
- " 'reason': 'TrialRunning',\n",
- " 'status': 'True',\n",
- " 'type': 'Running'}\n",
- "\n",
- "Trial Name: cmaes-example-f64n8vb5\n",
- "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 43, tzinfo=tzlocal()),\n",
- " 'message': 'Trial has succeeded',\n",
- " 'reason': 'TrialSucceeded',\n",
- " 'status': 'True',\n",
- " 'type': 'Succeeded'}\n",
- "\n",
- "Trial Name: cmaes-example-l6zkx5jx\n",
- "Trial Status: {'last_transition_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n",
- " 'last_update_time': datetime.datetime(2023, 1, 6, 14, 30, 45, tzinfo=tzlocal()),\n",
- " 'message': 'Trial has succeeded',\n",
- " 'reason': 'TrialSucceeded',\n",
- " 'status': 'True',\n",
- " 'type': 'Succeeded'}\n"
- ]
- }
- ],
- "source": [
- "# Trial list.\n",
- "trial_list = kclient.list_trials(experiment_name=experiment_name, namespace=namespace)\n",
- "for trial in trial_list:\n",
- " print(f\"Trial Name: {trial.metadata.name}\")\n",
- " print(f\"Trial Status: {trial.status.conditions[-1]}\\n\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Get the optimal HyperParameters\n",
- "\n",
- "You can get the current optimal Trial from your Experiment. For the each metric you can see the max, min and latest value."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'best_trial_name': 'cmaes-example-l6zkx5jx',\n",
- " 'observation': {'metrics': [{'latest': '0.955613',\n",
- " 'max': '0.955613',\n",
- " 'min': '0.955613',\n",
- " 'name': 'Validation-accuracy'},\n",
- " {'latest': '0.922775',\n",
- " 'max': '0.922775',\n",
- " 'min': '0.922775',\n",
- " 'name': 'Train-accuracy'}]},\n",
- " 'parameter_assignments': [{'name': 'lr', 'value': '0.04511033252270099'},\n",
- " {'name': 'num-layers', 'value': '3'},\n",
- " {'name': 'optimizer', 'value': 'sgd'}]}"
- ]
- },
- "execution_count": 56,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Optimal HPs.\n",
- "kclient.get_optimal_hyperparameters(name=experiment_name, namespace=namespace)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Status for the Suggestion objects\n",
- "\n",
- "Once Experiment is Succeeded, you can check the Suggestion object status for more information about resume status.\n",
- "\n",
- "For Experiment with FromVolume you should be able to check created PVC."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Suggestion is succeeded, can't be restarted\n",
- "-----------------\n",
- "Suggestion is succeeded, suggestion volume is not deleted, can be restarted\n"
- ]
- }
- ],
- "source": [
- "# Get the current Suggestion status for the never resume Experiment.\n",
- "suggestion = kclient.get_suggestion(name=experiment_never_resume_name, namespace=namespace)\n",
- "\n",
- "print(suggestion.status.conditions[-1].message)\n",
- "print(\"-----------------\")\n",
- "\n",
- "# Get the current Suggestion status for the from volume Experiment.\n",
- "suggestion = kclient.get_suggestion(name=experiment_from_volume_resume_name, namespace=namespace)\n",
- "\n",
- "print(suggestion.status.conditions[-1].message)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Delete your Experiments\n",
- "\n",
- "You can delete your Experiments."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Experiment kubeflow-user-example-com/cmaes-example has been deleted\n",
- "Experiment kubeflow-user-example-com/never-resume-cmaes has been deleted\n",
- "Experiment kubeflow-user-example-com/from-volume-resume-cmaes has been deleted\n"
- ]
- }
- ],
- "source": [
- "kclient.delete_experiment(name=experiment_name, namespace=namespace)\n",
- "kclient.delete_experiment(name=experiment_never_resume_name, namespace=namespace)\n",
- "kclient.delete_experiment(name=experiment_from_volume_resume_name, namespace=namespace)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
diff --git a/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb b/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb
deleted file mode 100644
index 547f069b545..00000000000
--- a/examples/v1beta1/sdk/.ipynb_checkpoints/tune-train-from-func-checkpoint.ipynb
+++ /dev/null
@@ -1,695 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "bf9ab16d-fbf6-4385-a7f8-133e4562e1e7",
- "metadata": {
- "editable": true,
- "pycharm": {
- "name": "#%% md\n"
- },
- "slideshow": {
- "slide_type": ""
- },
- "tags": []
- },
- "source": [
- "# Tune and Train with Kubeflow Katib and Training Operator\n",
- " \n",
- "In this Notebook we are going to do the following:\n",
- "\n",
- "- Train Tensorflow model using Kubeflow Notebook.\n",
- "- Improve the model HyperParameters with [Kubeflow Katib](https://www.kubeflow.org/docs/components/katib/overview/).\n",
- "- Use [Multi Worker Mirrored Strategy](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) to distributively train the model with [Kubeflow TFJob](https://www.kubeflow.org/docs/components/training/tftraining/)."
- ]
- },
- {
- "cell_type": "markdown",
- "id": "62d91e3d-904a-4a3c-b4e7-573324ba625e",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Install Kubeflow Python SDKs\n",
- "\n",
- "You need to install Tensorflow package and Kubeflow SDKs to run this Notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "5de885ca-e96a-4d59-9e78-75f6fc6f5ce7",
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "!pip install tensorflow==2.16.1\n",
- "# TODO (andreyvelich): Change to release version when SDK with the new APIs is published.\n",
- "!pip install git+https://github.com/kubeflow/katib.git#subdirectory=sdk/python/v1beta1\n",
- "!pip install git+https://github.com/kubeflow/training-operator.git#subdirectory=sdk/python"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "## Create Train Script for CNN Model\n",
- "\n",
- "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). "
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "def train_mnist_model(parameters):\n",
- " import tensorflow as tf\n",
- " import numpy as np\n",
- " import logging\n",
- "\n",
- " logging.basicConfig(\n",
- " format=\"%(asctime)s %(levelname)-8s %(message)s\",\n",
- " datefmt=\"%Y-%m-%dT%H:%M:%SZ\",\n",
- " level=logging.INFO,\n",
- " )\n",
- " logging.info(\"--------------------------------------------------------------------------------------\")\n",
- " logging.info(f\"Input Parameters: {parameters}\")\n",
- " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
- "\n",
- "\n",
- " # Get HyperParameters from the input params dict.\n",
- " lr = float(parameters[\"lr\"])\n",
- " num_epoch = int(parameters[\"num_epoch\"])\n",
- "\n",
- " # Set dist parameters and strategy.\n",
- " is_dist = parameters[\"is_dist\"]\n",
- " num_workers = parameters[\"num_workers\"]\n",
- " batch_size_per_worker = 64\n",
- " batch_size_global = batch_size_per_worker * num_workers\n",
- " strategy = tf.distribute.MultiWorkerMirroredStrategy(\n",
- " communication_options=tf.distribute.experimental.CommunicationOptions(\n",
- " implementation=tf.distribute.experimental.CollectiveCommunication.RING\n",
- " )\n",
- " )\n",
- "\n",
- " # Callback class for logging training.\n",
- " # Katib parses metrics in this format: =.\n",
- " class CustomCallback(tf.keras.callbacks.Callback):\n",
- " def on_epoch_end(self, epoch, logs=None):\n",
- " logging.info(\n",
- " \"Epoch {}/{}. accuracy={:.4f} - loss={:.4f}\".format(\n",
- " epoch+1, num_epoch, logs[\"accuracy\"], logs[\"loss\"]\n",
- " )\n",
- " )\n",
- "\n",
- " # Prepare MNIST Dataset.\n",
- " def mnist_dataset(batch_size):\n",
- " (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()\n",
- " x_train = x_train / np.float32(255)\n",
- " y_train = y_train.astype(np.int64)\n",
- " train_dataset = (\n",
- " tf.data.Dataset.from_tensor_slices((x_train, y_train))\n",
- " .shuffle(60000)\n",
- " .repeat()\n",
- " .batch(batch_size)\n",
- " )\n",
- " return train_dataset\n",
- "\n",
- " # Build and compile CNN Model.\n",
- " def build_and_compile_cnn_model():\n",
- " model = tf.keras.Sequential(\n",
- " [\n",
- " tf.keras.layers.InputLayer(input_shape=(28, 28)),\n",
- " tf.keras.layers.Reshape(target_shape=(28, 28, 1)),\n",
- " tf.keras.layers.Conv2D(32, 3, activation=\"relu\"),\n",
- " tf.keras.layers.Flatten(),\n",
- " tf.keras.layers.Dense(128, activation=\"relu\"),\n",
- " tf.keras.layers.Dense(10),\n",
- " ]\n",
- " )\n",
- " model.compile(\n",
- " loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=lr),\n",
- " metrics=[\"accuracy\"],\n",
- " )\n",
- " return model\n",
- " \n",
- " # Download Dataset.\n",
- " dataset = mnist_dataset(batch_size_global)\n",
- "\n",
- " # For dist strategy we should build model under scope().\n",
- " if is_dist:\n",
- " logging.info(\"Running Distributed Training\")\n",
- " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
- " with strategy.scope():\n",
- " model = build_and_compile_cnn_model()\n",
- " else:\n",
- " logging.info(\"Running Single Worker Training\")\n",
- " logging.info(\"--------------------------------------------------------------------------------------\\n\\n\")\n",
- " model = build_and_compile_cnn_model()\n",
- " \n",
- " # Start Training.\n",
- " model.fit(\n",
- " dataset,\n",
- " epochs=num_epoch,\n",
- " steps_per_epoch=70,\n",
- " callbacks=[CustomCallback()],\n",
- " verbose=0,\n",
- " )"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "## Run Training Locally in the Notebook\n",
- "\n",
- "We are going to download MNIST Dataset and start local training.\n",
- "\n",
- "Also, set `Epochs = 2` to reduce training time and avoid CPU overload. "
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "# Set Parameters for Local Training.\n",
- "parameters = {\n",
- " \"lr\": \"0.1\",\n",
- " \"num_epoch\": \"2\",\n",
- " \"is_dist\": False,\n",
- " \"num_workers\": 1\n",
- "}\n",
- "\n",
- "# Train Model locally in the Notebook.\n",
- "train_mnist_model(parameters)"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "## Start Model Tuning with Katib\n",
- "\n",
- "If you want to improve your model, you can run HyperParameter tuning with Katib.\n",
- "\n",
- "The following example uses **Covariance Matrix Adaptation Evolution Strategy (CMA-ES)** algorithm to tune HyperParameters.\n",
- "\n",
- "We are going to tune `learning rate` and `number of epochs`."
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "import kubeflow.katib as katib\n",
- "\n",
- "# Set parameters with their distribution for HyperParameter Tuning with Katib.\n",
- "parameters = {\n",
- " \"lr\": katib.search.double(min=0.1, max=0.2),\n",
- " \"num_epoch\": katib.search.int(min=10, max=15),\n",
- " \"is_dist\": False,\n",
- " \"num_workers\": 1\n",
- "}\n",
- "\n",
- "# Start the Katib Experiment.\n",
- "exp_name = \"tune-mnist\"\n",
- "katib_client = katib.KatibClient(namespace=namespace)\n",
- "\n",
- "katib_client.tune(\n",
- " name=exp_name,\n",
- " objective=train_mnist_model, # Objective function.\n",
- " parameters=parameters, # HyperParameters to tune.\n",
- " algorithm_name=\"cmaes\", # Alorithm to use.\n",
- " objective_metric_name=\"accuracy\", # Katib is going to optimize \"accuracy\".\n",
- " additional_metric_names=[\"loss\"], # Katib is going to collect these metrics in addition to the objective metric.\n",
- " max_trial_count=12, # Trial Threshold.\n",
- " parallel_trial_count=2,\n",
- ")"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Access to Katib UI\n",
- "\n",
- "You can check created Experiment in the Katib UI.\n",
- "\n",
- "![Screenshot 2022-09-12 at 20.06.23.png](attachment:cdaf463d-28b3-4a98-bb4c-9613ca1bfa50.png)"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Get the Best HyperParameters from the Katib Experiment\n",
- "\n",
- "You can get the best HyperParameters from the most optimal Katib Trial."
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "outputs": [],
- "source": [
- "import time\n",
- "time.sleep(120)\n",
- "status = katib_client.is_experiment_succeeded(exp_name, namespace=namespace)\n",
- "print(f\"Katib Experiment is Succeeded: {status}\\n\")\n",
- "\n",
- "best_hps = katib_client.get_optimal_hyperparameters(exp_name, namespace=namespace)\n",
- "\n",
- "if best_hps != None:\n",
- " print(\"Current Optimal Trial\\n\")\n",
- " print(best_hps)\n",
- " \n",
- " for hp in best_hps.parameter_assignments:\n",
- " if hp.name == \"lr\":\n",
- " best_lr = hp.value\n",
- " else:\n",
- " best_num_epoch = hp.value"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "6289e27f-325d-4433-9379-7e97bc8aae69",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:08:51.055746Z",
- "iopub.status.busy": "2022-09-12T19:08:51.054605Z",
- "iopub.status.idle": "2022-09-12T19:08:51.246141Z",
- "shell.execute_reply": "2022-09-12T19:08:51.244919Z",
- "shell.execute_reply.started": "2022-09-12T19:08:51.055713Z"
- },
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-09-12T19:08:51Z INFO TFJob train-mnist has been created\n"
- ]
- }
- ],
- "source": [
- "from kubeflow.training import TrainingClient\n",
- "\n",
- "# Set Parameters for Distributed Training with TFJob.\n",
- "parameters = {\n",
- " \"lr\": best_lr,\n",
- " \"num_epoch\": best_num_epoch,\n",
- " \"is_dist\": True,\n",
- " \"num_workers\": 5\n",
- "}\n",
- "\n",
- "# Start TFJob Training.\n",
- "tfjob_name = \"train-mnist\"\n",
- "tfjob_client = TrainingClient(namespace=namespace)\n",
- "\n",
- "#create_tfjob_from_func\n",
- "tfjob_client.create_job(\n",
- " name=tfjob_name,\n",
- " namespace=namespace,\n",
- " job_kind=\"TFJob\",\n",
- " train_func=train_mnist_model,\n",
- " parameters=parameters, # Input parameters for the train function.\n",
- " num_workers=5, # How many TFJob Workers will be run.\n",
- " base_image=\"tensorflow/tensorflow:2.10.0\", # Use TensorFlow image\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "d5d465e8-0310-4c72-ad36-209259ad5c34",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "### Get TFJob Status and Training Logs\n",
- "\n",
- "You can check the TFJob status and logs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "53859cf4-7a35-4fc4-b5ee-9ba774635df0",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:06.862146Z",
- "iopub.status.busy": "2022-09-12T19:10:06.861177Z",
- "iopub.status.idle": "2022-09-12T19:10:06.945011Z",
- "shell.execute_reply": "2022-09-12T19:10:06.943629Z",
- "shell.execute_reply.started": "2022-09-12T19:10:06.862104Z"
- },
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "TFJob status: Succeeded\n"
- ]
- }
- ],
- "source": [
- "print(f\"TFJob status: {tfjob_client.get_job_conditions(tfjob_name, namespace=namespace, job_kind='TFJob')}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "id": "f247670e-0bd4-4336-a40c-605ce32fad23",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:11.765592Z",
- "iopub.status.busy": "2022-09-12T19:10:11.764384Z",
- "iopub.status.idle": "2022-09-12T19:10:14.249858Z",
- "shell.execute_reply": "2022-09-12T19:10:14.248518Z",
- "shell.execute_reply.started": "2022-09-12T19:10:11.765560Z"
- },
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n",
- "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO Input Parameters: {'lr': '0.17016692449867332', 'num_epoch': '13', 'is_dist': True, 'num_workers': 5}\n",
- "2022-09-12T19:10:11Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:53Z INFO --------------------------------------------------------------------------------------\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:53.988515: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008619: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.008700: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:272] Initialize GrpcChannelCache for job worker -> {0 -> train-mnist-worker-0.kubeflow-andrey.svc:2222, 1 -> train-mnist-worker-1.kubeflow-andrey.svc:2222, 2 -> train-mnist-worker-2.kubeflow-andrey.svc:2222, 3 -> train-mnist-worker-3.kubeflow-andrey.svc:2222, 4 -> train-mnist-worker-4.kubeflow-andrey.svc:2222}\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:54.009579: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:438] Started server with target: grpc://train-mnist-worker-0.kubeflow-andrey.svc:2222\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Enabled multi-worker collective ops with available devices: ['/job:worker/replica:0/task:0/device:CPU:0', '/job:worker/replica:0/task:4/device:CPU:0', '/job:worker/replica:0/task:2/device:CPU:0', '/job:worker/replica:0/task:3/device:CPU:0', '/job:worker/replica:0/task:1/device:CPU:0']\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Check health not enabled.\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO Check health not enabled.\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:54Z INFO MultiWorkerMirroredStrategy with cluster_spec = {'worker': ['train-mnist-worker-0.kubeflow-andrey.svc:2222', 'train-mnist-worker-1.kubeflow-andrey.svc:2222', 'train-mnist-worker-2.kubeflow-andrey.svc:2222', 'train-mnist-worker-3.kubeflow-andrey.svc:2222', 'train-mnist-worker-4.kubeflow-andrey.svc:2222']}, task_type = 'worker', task_id = 0, num_workers = 5, local_devices = ('/job:worker/task:0/device:CPU:0',), communication = CommunicationImplementation.RING\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n",
- "11490434/11490434 [==============================] - 0s 0us/step\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO Running Distributed Training\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:56Z INFO --------------------------------------------------------------------------------------\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.666389: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:776] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: \"TensorSliceDataset/_2\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: op: \"TensorSliceDataset\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_0\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: input: \"Placeholder/_1\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"Toutput_types\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_FLOAT\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: type: DT_INT64\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"_cardinality\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: i: 60000\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"is_files\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: b: false\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"metadata\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: s: \"\\n\\024TensorSliceDataset:0\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: attr {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: key: \"output_shapes\"\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: value {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: list {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: shape {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: dim {\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: size: 28\n",
- "2022-09-12T19:10:12Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: dim {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: size: 28\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: shape {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: experimental_type {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_DATASET\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_PRODUCT\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_FLOAT\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_TENSOR\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: args {\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: type_id: TFT_INT64\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: }\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12 19:08:56.901683: W tensorflow/core/framework/dataset.cc:768] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:57Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 6 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: INFO:tensorflow:Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:13Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:08:58Z INFO Collective all_reduce tensors: 1 all_reduces, num_devices = 1, group_size = 5, implementation = CommunicationImplementation.RING, num_packs = 1\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:04Z INFO Epoch 1/13. accuracy=0.7755 - loss=0.7565\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:09Z INFO Epoch 2/13. accuracy=0.9104 - loss=0.2964\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:13Z INFO Epoch 3/13. accuracy=0.9371 - loss=0.2100\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:18Z INFO Epoch 4/13. accuracy=0.9475 - loss=0.1756\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:23Z INFO Epoch 5/13. accuracy=0.9505 - loss=0.1612\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:27Z INFO Epoch 6/13. accuracy=0.9608 - loss=0.1309\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:32Z INFO Epoch 7/13. accuracy=0.9613 - loss=0.1298\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:37Z INFO Epoch 8/13. accuracy=0.9645 - loss=0.1165\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:41Z INFO Epoch 9/13. accuracy=0.9717 - loss=0.0962\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:46Z INFO Epoch 10/13. accuracy=0.9719 - loss=0.0920\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:51Z INFO Epoch 11/13. accuracy=0.9743 - loss=0.0873\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:09:55Z INFO Epoch 12/13. accuracy=0.9751 - loss=0.0831\n",
- "2022-09-12T19:10:14Z INFO [Pod train-mnist-worker-0]: 2022-09-12T19:10:00Z INFO Epoch 13/13. accuracy=0.9765 - loss=0.0803\n"
- ]
- }
- ],
- "source": [
- "tfjob_client.get_job_logs(name=tfjob_name, namespace=namespace, is_master=True, follow=True, job_kind=\"TFJob\")"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "227c0a9a-fdf5-4047-b0e2-ec15d3c120ac",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-08-09T23:50:29.596391Z",
- "iopub.status.busy": "2022-08-09T23:50:29.596145Z",
- "iopub.status.idle": "2022-08-09T23:50:29.599222Z",
- "shell.execute_reply": "2022-08-09T23:50:29.598674Z",
- "shell.execute_reply.started": "2022-08-09T23:50:29.596363Z"
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "## Delete Katib Experiment and TFJob\n",
- "\n",
- "When jobs are finished, you can delete the resources."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "id": "dd24acd8-4305-463e-a6e6-eed16d8a7c51",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:19.053646Z",
- "iopub.status.busy": "2022-09-12T19:10:19.052424Z",
- "iopub.status.idle": "2022-09-12T19:10:19.144593Z",
- "shell.execute_reply": "2022-09-12T19:10:19.143396Z",
- "shell.execute_reply.started": "2022-09-12T19:10:19.053607Z"
- },
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Experiment tune-mnist has been deleted\n"
- ]
- }
- ],
- "source": [
- "katib_client.delete_experiment(exp_name)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "id": "025fa4af-256d-4027-99ba-ba44c1409541",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2022-09-12T19:10:19.532471Z",
- "iopub.status.busy": "2022-09-12T19:10:19.531949Z",
- "iopub.status.idle": "2022-09-12T19:10:19.550331Z",
- "shell.execute_reply": "2022-09-12T19:10:19.549103Z",
- "shell.execute_reply.started": "2022-09-12T19:10:19.532441Z"
- },
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-09-12T19:10:19Z INFO TFJob train-mnist has been deleted\n"
- ]
- }
- ],
- "source": [
- "tfjob_client.delete_job(tfjob_name, namespace=namespace, job_kind=\"TFJob\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "e238a638-cf77-423f-a346-f763fc8b1582",
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.9.7"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
\ No newline at end of file
diff --git a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb
index 27de8eff0b8..72445e7390f 100644
--- a/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb
+++ b/examples/v1beta1/sdk/cmaes-and-resume-policies.ipynb
@@ -103,22 +103,41 @@
{
"cell_type": "code",
"execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": [
+ "parameters"
+ ]
+ },
"outputs": [],
"source": [
"# Experiment name and namespace.\n",
"namespace = \"kubeflow\"\n",
"experiment_name = \"cmaes-example\"\n"
- ],
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {
"collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
"name": "#%%\n"
}
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ },
"outputs": [],
"source": [
"\n",
@@ -225,13 +244,7 @@
" trial_template=trial_template,\n",
" )\n",
")"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ ]
},
{
"cell_type": "code",
diff --git a/examples/v1beta1/sdk/tune-train-from-func.ipynb b/examples/v1beta1/sdk/tune-train-from-func.ipynb
index b5ab52681e5..d42cbad4312 100644
--- a/examples/v1beta1/sdk/tune-train-from-func.ipynb
+++ b/examples/v1beta1/sdk/tune-train-from-func.ipynb
@@ -62,35 +62,60 @@
{
"cell_type": "code",
"execution_count": null,
- "outputs": [],
- "source": [
- "# Experiment namespace.\n",
- "namespace = \"default\" "
- ],
+ "id": "b807dfbb",
"metadata": {
"collapsed": false,
+ "editable": true,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
"name": "#%%\n"
- }
- }
+ },
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": [
+ "parameters"
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "# Experiment namespace.\n",
+ "namespace = \"default\" "
+ ]
},
{
"cell_type": "markdown",
- "source": [
- "## Create Train Script for CNN Model\n",
- "\n",
- "This is simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). "
- ],
+ "id": "9f319483",
"metadata": {
"collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
"name": "#%% md\n"
}
- }
+ },
+ "source": [
+ "## Create Train Script for CNN Model\n",
+ "\n",
+ "This is a simple **Convolutional Neural Network (CNN)** model for recognizing hand-written digits using [MNIST Dataset](http://yann.lecun.com/exdb/mnist/). "
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "727ec914",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"def train_mnist_model(parameters):\n",
@@ -187,33 +212,41 @@
" callbacks=[CustomCallback()],\n",
" verbose=0,\n",
" )"
- ],
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4b8a6dde",
"metadata": {
"collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
- "name": "#%%\n"
+ "name": "#%% md\n"
}
- }
- },
- {
- "cell_type": "markdown",
+ },
"source": [
"## Run Training Locally in the Notebook\n",
"\n",
"We are going to download MNIST Dataset and start local training.\n",
"\n",
"Also, set `Epochs = 2` to reduce training time and avoid CPU overload. "
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "18e13f06",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"# Set Parameters for Local Training.\n",
@@ -226,16 +259,20 @@
"\n",
"# Train Model locally in the Notebook.\n",
"train_mnist_model(parameters)"
- ],
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e73eb9a5",
"metadata": {
"collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
- "name": "#%%\n"
+ "name": "#%% md\n"
}
- }
- },
- {
- "cell_type": "markdown",
+ },
"source": [
"## Start Model Tuning with Katib\n",
"\n",
@@ -244,17 +281,21 @@
"The following example uses **Covariance Matrix Adaptation Evolution Strategy (CMA-ES)** algorithm to tune HyperParameters.\n",
"\n",
"We are going to tune `learning rate` and `number of epochs`."
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "841345df",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"import kubeflow.katib as katib\n",
@@ -281,47 +322,59 @@
" max_trial_count=12, # Trial Threshold.\n",
" parallel_trial_count=2,\n",
")"
- ],
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13fb2f5c",
"metadata": {
"collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
- "name": "#%%\n"
+ "name": "#%% md\n"
}
- }
- },
- {
- "cell_type": "markdown",
+ },
"source": [
"### Access to Katib UI\n",
"\n",
"You can check created Experiment in the Katib UI.\n",
"\n",
"![Screenshot 2022-09-12 at 20.06.23.png](attachment:cdaf463d-28b3-4a98-bb4c-9613ca1bfa50.png)"
- ],
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f24c9fd7",
"metadata": {
"collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
"pycharm": {
"name": "#%% md\n"
}
- }
- },
- {
- "cell_type": "markdown",
+ },
"source": [
"### Get the Best HyperParameters from the Katib Experiment\n",
"\n",
"You can get the best HyperParameters from the most optimal Katib Trial."
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%% md\n"
- }
- }
+ ]
},
{
"cell_type": "code",
"execution_count": null,
+ "id": "964e2f4c",
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
"outputs": [],
"source": [
"katib_client.wait_for_experiment_condition(exp_name, namespace=namespace)\n",
@@ -339,13 +392,7 @@
" best_lr = hp.value\n",
" else:\n",
" best_num_epoch = hp.value"
- ],
- "metadata": {
- "collapsed": false,
- "pycharm": {
- "name": "#%%\n"
- }
- }
+ ]
},
{
"cell_type": "code",