From 512be3b2140f29187a766a2ead2ea53773c1d796 Mon Sep 17 00:00:00 2001
From: Ubuntu <jaylamb20@gmail.com>
Date: Wed, 11 Dec 2024 21:26:57 +0000
Subject: [PATCH] update and simplify HPO example

---
 .gitignore                                    |   3 +
 source/examples/rapids-azureml-hpo/Dockerfile |  10 -
 .../rapids-azureml-hpo/notebook.ipynb         | 246 +++++++-----------
 .../examples/rapids-optuna-hpo/notebook.ipynb |   2 +-
 4 files changed, 96 insertions(+), 165 deletions(-)
 delete mode 100644 source/examples/rapids-azureml-hpo/Dockerfile

diff --git a/.gitignore b/.gitignore
index e6ad9798..dd5dfc89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,5 +23,8 @@ cufile.log
 node_modules/
 jupyter_execute/
 
+# files generated by running the example notebooks
+source/examples/rapids-azureml-hpo/Dockerfile
+
 # exclusions
 !source/examples/rapids-1brc-single-node/lookup.csv
diff --git a/source/examples/rapids-azureml-hpo/Dockerfile b/source/examples/rapids-azureml-hpo/Dockerfile
deleted file mode 100644
index bb90d5a1..00000000
--- a/source/examples/rapids-azureml-hpo/Dockerfile
+++ /dev/null
@@ -1,10 +0,0 @@
-# Use rapids base image v23.02 with the necessary dependencies
-FROM rapidsai/rapidsai:23.02-cuda11.8-runtime-ubuntu22.04-py3.10
-
-# Update package information and install required packages
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends build-essential fuse && \
-    rm -rf /var/lib/apt/lists/*
-
-# Activate rapids conda environment
-RUN /bin/bash -c "source activate rapids && pip install azureml-mlflow azureml-dataprep"
diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb
index f7065d27..b98109df 100644
--- a/source/examples/rapids-azureml-hpo/notebook.ipynb
+++ b/source/examples/rapids-azureml-hpo/notebook.ipynb
@@ -34,7 +34,9 @@
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "source": [
     "# Initialize Workspace"
    ]
@@ -52,18 +54,10 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Workspace name: rapids-aml-cluster\n",
-      "Subscription id: fc4f4a6b-4041-4b1c-8249-854d68edcf62\n",
-      "Resource group: rapidsai-deployment\n"
-     ]
-    }
-   ],
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "from azure.ai.ml import MLClient\n",
     "from azure.identity import DefaultAzureCredential\n",
@@ -99,18 +93,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "data uri: \n",
-      " azureml://subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster/datastores/workspaceartifactstore/paths/airline_20000000.parquet\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "datastore_name = \"workspaceartifactstore\"\n",
     "dataset = \"airline_20000000.parquet\"\n",
@@ -148,33 +135,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "found compute target. Will use rapids-cluster\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
     "from azure.ai.ml.entities import AmlCompute\n",
     "from azure.ai.ml.exceptions import MlException\n",
     "\n",
     "# specify aml compute name.\n",
-    "gpu_compute_target = \"rapids-cluster\"\n",
+    "target_name = \"rapids-cluster\"\n",
     "\n",
     "try:\n",
     "    # let's see if the compute target already exists\n",
-    "    gpu_target = ml_client.compute.get(gpu_compute_target)\n",
-    "    print(f\"found compute target. Will use {gpu_compute_target}\")\n",
+    "    gpu_target = ml_client.compute.get(target_name)\n",
+    "    print(f\"found compute target. Will use {gpu_target.name}\")\n",
     "except MlException:\n",
     "    print(\"Creating a new gpu compute target...\")\n",
     "\n",
     "    gpu_target = AmlCompute(\n",
-    "        name=\"rapids-cluster\",\n",
+    "        name=target_name,\n",
     "        type=\"amlcompute\",\n",
     "        size=\"STANDARD_NC12S_V3\",\n",
     "        max_instances=5,\n",
@@ -223,16 +204,6 @@
     "These run metrics will become particularly important when we begin hyperparameter tuning our model in the 'Tune model hyperparameters' section."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "rapids_script = \"./train_rapids.py\"\n",
-    "azure_script = \"./rapids_csp_azure.py\""
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -246,70 +217,58 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Create Experiment\n",
-    "\n",
-    "Track all the runs in your workspace"
+    "## Setup Environment"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 6,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "experiment_name = \"test_rapids_aml_cluster\""
+    "We'll be using a custom RAPIDS docker image to [set up the environment](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image). This is available in the `rapidsai/base` repo on [DockerHub](https://hub.docker.com/r/rapidsai/base/)."
    ]
   },
   {
-   "cell_type": "markdown",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "## Setup Environment"
+    "%%bash\n",
+    "# create a Dockerfile defining the image the code will run in\n",
+    "cat > ./Dockerfile <<EOF\n",
+    "FROM {{ rapids_container }}\n",
+    "\n",
+    "RUN conda install --yes -c conda-forge 'dask-ml>=2024.4.4' \\\n",
+    " && pip install azureml-mlflow\n",
+    "EOF"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "source": [
-    "We'll be using a custom RAPIDS docker image to [setup the environment](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image). This is available in `rapidsai/rapidsai` repo on [DockerHub](https://hub.docker.com/r/rapidsai/rapidsai/).\n",
-    "\n",
-    "Make sure you have the correct path to the docker build context as `os.getcwd()`,"
+    "Make sure the docker build context path, given below as `os.getcwd()`, is the directory containing the `Dockerfile`."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\u001b[32mUploading code (0.33 MBs): 100%|██████████| 325450/325450 [00:00<00:00, 2363322.62it/s]\n",
-      "\u001b[39m\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "Environment({'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'rapids-mlflow', 'description': 'RAPIDS environment with azureml-mlflow', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourceGroups/rapidsai-deployment/providers/Microsoft.MachineLearningServices/workspaces/rapids-aml-cluster/environments/rapids-mlflow/versions/10', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/skirui1/code', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f9ce47101f0>, 'serialize': <msrest.serialization.Serializer object at 0x7f9ce4710d30>, 'version': '10', 'latest_version': None, 'conda_file': None, 'image': None, 'build': <azure.ai.ml.entities._assets.environment.BuildContext object at 0x7f9ce4713580>, 'inference_config': None, 'os_type': 'Linux', 'arm_type': 'environment_version', 'conda_file_path': None, 'path': None, 'datastore': None, 'upload_hash': None, 'translated_conda_file': None})"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# RUN THIS CODE ONCE TO SETUP ENVIRONMENT\n",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
     "import os\n",
     "\n",
     "from azure.ai.ml.entities import BuildContext, Environment\n",
     "\n",
     "env_docker_image = Environment(\n",
     "    build=BuildContext(path=os.getcwd()),\n",
-    "    name=\"rapids-mlflow\",\n",
+    "    name=\"rapids-hpo\",\n",
     "    description=\"RAPIDS environment with azureml-mlflow\",\n",
     ")\n",
     "\n",
@@ -340,39 +299,13 @@
      "library/cudf"
     ]
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n",
-      "Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n",
-      "Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n",
-      "Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n",
-      "Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n",
-      "Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n",
-      "\u001b[32mUploading code (0.33 MBs): 100%|██████████| 327210/327210 [00:00<00:00, 1802654.05it/s]\n",
-      "\u001b[39m\n",
-      "\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "'https://ml.azure.com/runs/zen_eye_lm7dcp68jz?wsid=/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster&tid=43083d15-7273-40c1-b7db-39efd9ccc17a'"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from azure.ai.ml import Input, command\n",
     "\n",
     "command_job = command(\n",
     "    environment=f\"{env_docker_image.name}:{env_docker_image.version}\",\n",
-    "    experiment_name=experiment_name,\n",
+    "    experiment_name=\"test_rapids_aml_hpo_cluster\",\n",
     "    code=os.getcwd(),\n",
     "    inputs={\n",
     "        \"data_dir\": Input(type=\"uri_file\", path=data_uri),\n",
@@ -383,12 +316,15 @@
     "        \"max_depth\": 6,\n",
     "        \"max_features\": 0.3,\n",
     "    },\n",
-    "    command=(\n",
-    "        \"python train_rapids.py --data_dir ${{inputs.data_dir}} --n_bins ${{inputs.n_bins}} \"\n",
-    "        \"--compute ${{inputs.compute}} --cv_folds ${{inputs.cv_folds}} --n_estimators ${{inputs.n_estimators}} \"\n",
-    "        \"--max_depth ${{inputs.max_depth}}  --max_features ${{inputs.max_features}}\"\n",
-    "    ),\n",
-    "    compute=\"rapids-cluster\",\n",
+    "    command=\"python train_rapids.py \\\n",
+    "                    --data_dir ${{inputs.data_dir}} \\\n",
+    "                    --n_bins ${{inputs.n_bins}} \\\n",
+    "                    --compute ${{inputs.compute}} \\\n",
+    "                    --cv_folds ${{inputs.cv_folds}} \\\n",
+    "                    --n_estimators ${{inputs.n_estimators}} \\\n",
+    "                    --max_depth ${{inputs.max_depth}} \\\n",
+    "                    --max_features ${{inputs.max_features}}\",\n",
+    "    compute=gpu_target.name,\n",
     ")\n",
     "\n",
     "\n",
@@ -430,7 +366,9 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "from azure.ai.ml.sweep import Choice, Uniform\n",
@@ -443,19 +381,21 @@
     "\n",
     "# apply sweep parameter to obtain the sweep_job\n",
     "sweep_job = command_job_for_sweep.sweep(\n",
-    "    compute=\"rapids-cluster\",\n",
+    "    compute=gpu_target.name,\n",
     "    sampling_algorithm=\"random\",\n",
     "    primary_metric=\"Accuracy\",\n",
     "    goal=\"Maximize\",\n",
     ")\n",
     "\n",
     "\n",
-    "# Define the limits for this sweep\n",
+    "# Relax these limits to run more trials\n",
     "sweep_job.set_limits(\n",
-    "    max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600\n",
+    "    max_total_trials=5,\n",
+    "    max_concurrent_trials=5,\n",
+    "    timeout=18000,\n",
+    "    trial_timeout=3600\n",
     ")\n",
     "\n",
-    "\n",
     "# Specify your experiment details\n",
     "sweep_job.display_name = \"RF-rapids-sweep-job\"\n",
     "sweep_job.description = \"Run RAPIDS hyperparameter sweep job\""
@@ -470,8 +410,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "# submit the hpo job\n",
@@ -482,26 +424,18 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Monitor SweepJobs runs"
+    "## Monitor runs"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Monitor your job at https://ml.azure.com/runs/eager_turtle_r7fs2xzcty?wsid=/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster&tid=43083d15-7273-40c1-b7db-39efd9ccc17a\n"
-     ]
-    }
-   ],
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "aml_url = returned_sweep_job.studio_url\n",
-    "\n",
-    "print(\"Monitor your job at\", aml_url)"
+    "print(f\"Monitor your job at {returned_sweep_job.studio_url}\")"
    ]
   },
   {
@@ -520,8 +454,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
     "ml_client.jobs.download(returned_sweep_job.name, output_name=\"model\")"
@@ -536,11 +472,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
    "outputs": [],
    "source": [
-    "ml_client.compute.begin_delete(gpu_compute_target).wait()"
+    "ml_client.compute.begin_delete(gpu_target.name).wait()"
    ]
   }
  ],
@@ -549,9 +487,9 @@
    "name": "rapids"
   },
   "kernelspec": {
-   "display_name": "rapids-23.06",
+   "display_name": "rapids",
    "language": "python",
-   "name": "rapids-23.06"
+   "name": "rapids"
   },
   "language_info": {
    "codemirror_mode": {
@@ -563,7 +501,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.8"
   },
   "microsoft": {
    "ms_spell_check": {
diff --git a/source/examples/rapids-optuna-hpo/notebook.ipynb b/source/examples/rapids-optuna-hpo/notebook.ipynb
index 3f16ccf3..79c181a2 100644
--- a/source/examples/rapids-optuna-hpo/notebook.ipynb
+++ b/source/examples/rapids-optuna-hpo/notebook.ipynb
@@ -380,7 +380,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.10.11"
   }
  },
  "nbformat": 4,