From 512be3b2140f29187a766a2ead2ea53773c1d796 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 11 Dec 2024 21:26:57 +0000 Subject: [PATCH] update and simplify HPO example --- .gitignore | 3 + source/examples/rapids-azureml-hpo/Dockerfile | 10 - .../rapids-azureml-hpo/notebook.ipynb | 246 +++++++----------- .../examples/rapids-optuna-hpo/notebook.ipynb | 2 +- 4 files changed, 96 insertions(+), 165 deletions(-) delete mode 100644 source/examples/rapids-azureml-hpo/Dockerfile diff --git a/.gitignore b/.gitignore index e6ad9798..dd5dfc89 100644 --- a/.gitignore +++ b/.gitignore @@ -23,5 +23,8 @@ cufile.log node_modules/ jupyter_execute/ +# files manually written by example code +source/examples/rapids-azureml-hpo/Dockerfile + # exclusions !source/examples/rapids-1brc-single-node/lookup.csv diff --git a/source/examples/rapids-azureml-hpo/Dockerfile b/source/examples/rapids-azureml-hpo/Dockerfile deleted file mode 100644 index bb90d5a1..00000000 --- a/source/examples/rapids-azureml-hpo/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -# Use rapids base image v23.02 with the necessary dependencies -FROM rapidsai/rapidsai:23.02-cuda11.8-runtime-ubuntu22.04-py3.10 - -# Update package information and install required packages -RUN apt-get update && \ - apt-get install -y --no-install-recommends build-essential fuse && \ - rm -rf /var/lib/apt/lists/* - -# Activate rapids conda environment -RUN /bin/bash -c "source activate rapids && pip install azureml-mlflow azureml-dataprep" diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index f7065d27..b98109df 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -34,7 +34,9 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "# Initialize Workspace" ] @@ -52,18 +54,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Workspace name: rapids-aml-cluster\n", - "Subscription id: fc4f4a6b-4041-4b1c-8249-854d68edcf62\n", - "Resource group: rapidsai-deployment\n" - ] - } - ], + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "from azure.ai.ml import MLClient\n", "from azure.identity import DefaultAzureCredential\n", @@ -99,18 +93,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data uri: \n", - " azureml://subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster/datastores/workspaceartifactstore/paths/airline_20000000.parquet\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "datastore_name = \"workspaceartifactstore\"\n", "dataset = \"airline_20000000.parquet\"\n", @@ -148,33 +135,27 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "found compute target. Will use rapids-cluster\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "from azure.ai.ml.entities import AmlCompute\n", "from azure.ai.ml.exceptions import MlException\n", "\n", "# specify aml compute name.\n", - "gpu_compute_target = \"rapids-cluster\"\n", + "target_name = \"rapids-cluster\"\n", "\n", "try:\n", " # let's see if the compute target already exists\n", - " gpu_target = ml_client.compute.get(gpu_compute_target)\n", - " print(f\"found compute target. Will use {gpu_compute_target}\")\n", + " gpu_target = ml_client.compute.get(target_name)\n", + " print(f\"found compute target. Will use {gpu_target.name}\")\n", "except MlException:\n", " print(\"Creating a new gpu compute target...\")\n", "\n", " gpu_target = AmlCompute(\n", - " name=\"rapids-cluster\",\n", + " name=target_name,\n", " type=\"amlcompute\",\n", " size=\"STANDARD_NC12S_V3\",\n", " max_instances=5,\n", @@ -223,16 +204,6 @@ "These run metrics will become particularly important when we begin hyperparameter tuning our model in the 'Tune model hyperparameters' section." ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "rapids_script = \"./train_rapids.py\"\n", - "azure_script = \"./rapids_csp_azure.py\"" - ] - }, { "cell_type": "markdown", "metadata": { @@ -246,70 +217,58 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Experiment\n", - "\n", - "Track all the runs in your workspace" + "## Setup Environment" ] }, { - "cell_type": "code", - "execution_count": 6, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "experiment_name = \"test_rapids_aml_cluster\"" + "We'll be using a custom RAPIDS docker image to [setup the environment](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image). This is available in `rapidsai/base` repo on [DockerHub](https://hub.docker.com/r/rapidsai/base/)." ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "## Setup Environment" + "%%bash\n", + "# create a Dockerfile defining the image the code will run in\n", + "cat > ./Dockerfile <=2024.4.4' \\\n", + " && pip install azureml-mlflow\n", + "EOF" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "We'll be using a custom RAPIDS docker image to [setup the environment](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image). This is available in `rapidsai/rapidsai` repo on [DockerHub](https://hub.docker.com/r/rapidsai/rapidsai/).\n", - "\n", - "Make sure you have the correct path to the docker build context as `os.getcwd()`," + "Make sure you have the correct path to the docker build context as `os.getcwd()`." ] }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32mUploading code (0.33 MBs): 100%|██████████| 325450/325450 [00:00<00:00, 2363322.62it/s]\n", - "\u001b[39m\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "Environment({'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'rapids-mlflow', 'description': 'RAPIDS environment with azureml-mlflow', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourceGroups/rapidsai-deployment/providers/Microsoft.MachineLearningServices/workspaces/rapids-aml-cluster/environments/rapids-mlflow/versions/10', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/skirui1/code', 'creation_context': , 'serialize': , 'version': '10', 'latest_version': None, 'conda_file': None, 'image': None, 'build': , 'inference_config': None, 'os_type': 'Linux', 'arm_type': 'environment_version', 'conda_file_path': None, 'path': None, 'datastore': None, 'upload_hash': None, 'translated_conda_file': None})" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# RUN THIS CODE ONCE TO SETUP ENVIRONMENT\n", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ "import os\n", "\n", "from azure.ai.ml.entities import BuildContext, Environment\n", "\n", "env_docker_image = Environment(\n", " build=BuildContext(path=os.getcwd()),\n", - " name=\"rapids-mlflow\",\n", + " name=\"rapids-hpo\",\n", " description=\"RAPIDS environment with azureml-mlflow\",\n", ")\n", "\n", @@ -340,39 +299,13 @@ "library/cudf" ] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "\u001b[32mUploading code (0.33 MBs): 100%|██████████| 327210/327210 [00:00<00:00, 1802654.05it/s]\n", - "\u001b[39m\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "'https://ml.azure.com/runs/zen_eye_lm7dcp68jz?wsid=/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster&tid=43083d15-7273-40c1-b7db-39efd9ccc17a'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from azure.ai.ml import Input, command\n", "\n", "command_job = command(\n", " environment=f\"{env_docker_image.name}:{env_docker_image.version}\",\n", - " experiment_name=experiment_name,\n", + " experiment_name=\"test_rapids_aml_hpo_cluster\",\n", " code=os.getcwd(),\n", " inputs={\n", " \"data_dir\": Input(type=\"uri_file\", path=data_uri),\n", @@ -383,12 +316,15 @@ " \"max_depth\": 6,\n", " \"max_features\": 0.3,\n", " },\n", - " command=(\n", - " \"python train_rapids.py --data_dir ${{inputs.data_dir}} --n_bins ${{inputs.n_bins}} \"\n", - " \"--compute ${{inputs.compute}} --cv_folds ${{inputs.cv_folds}} --n_estimators ${{inputs.n_estimators}} \"\n", - " \"--max_depth ${{inputs.max_depth}} --max_features ${{inputs.max_features}}\"\n", - " ),\n", - " compute=\"rapids-cluster\",\n", + " command=\"python train_rapids.py \\\n", + " --data_dir ${{inputs.data_dir}} \\\n", + " --n_bins ${{inputs.n_bins}} \\\n", + " --compute ${{inputs.compute}} \\\n", + " --cv_folds ${{inputs.cv_folds}} \\\n", + " --n_estimators ${{inputs.n_estimators}} \\\n", + " --max_depth ${{inputs.max_depth}} \\\n", + " --max_features ${{inputs.max_features}}\",\n", + " compute=gpu_target.name,\n", ")\n", "\n", "\n", @@ -430,7 +366,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from azure.ai.ml.sweep import Choice, Uniform\n", @@ -443,19 +381,21 @@ "\n", "# apply sweep parameter to obtain the sweep_job\n", "sweep_job = command_job_for_sweep.sweep(\n", - " compute=\"rapids-cluster\",\n", + " compute=gpu_target.name,\n", " sampling_algorithm=\"random\",\n", " primary_metric=\"Accuracy\",\n", " goal=\"Maximize\",\n", ")\n", "\n", "\n", - "# Define the limits for this sweep\n", + "# Relax these limits to run more trials\n", "sweep_job.set_limits(\n", - " max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600\n", + " max_total_trials=5,\n", + " max_concurrent_trials=5,\n", + " timeout=18000,\n", + " trial_timeout=3600\n", ")\n", "\n", - "\n", "# Specify your experiment details\n", "sweep_job.display_name = \"RF-rapids-sweep-job\"\n", "sweep_job.description = \"Run RAPIDS hyperparameter sweep job\"" @@ -470,8 +410,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# submit the hpo job\n", @@ -482,26 +424,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Monitor SweepJobs runs" + "## Monitor runs" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Monitor your job at https://ml.azure.com/runs/eager_turtle_r7fs2xzcty?wsid=/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster&tid=43083d15-7273-40c1-b7db-39efd9ccc17a\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "aml_url = returned_sweep_job.studio_url\n", - "\n", - "print(\"Monitor your job at\", aml_url)" + "print(f\"Monitor your job at {returned_sweep_job.studio_url}\")" ] }, { @@ -520,8 +454,10 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "ml_client.jobs.download(returned_sweep_job.name, output_name=\"model\")" @@ -536,11 +472,13 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "ml_client.compute.begin_delete(gpu_compute_target).wait()" + "ml_client.compute.begin_delete(gpu_target.name).wait()" ] } ], @@ -549,9 +487,9 @@ "name": "rapids" }, "kernelspec": { - "display_name": "rapids-23.06", + "display_name": "rapids", "language": "python", - "name": "rapids-23.06" + "name": "rapids" }, "language_info": { "codemirror_mode": { @@ -563,7 +501,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.8" }, "microsoft": { "ms_spell_check": { diff --git a/source/examples/rapids-optuna-hpo/notebook.ipynb b/source/examples/rapids-optuna-hpo/notebook.ipynb index 3f16ccf3..79c181a2 100644 --- a/source/examples/rapids-optuna-hpo/notebook.ipynb +++ b/source/examples/rapids-optuna-hpo/notebook.ipynb @@ -380,7 +380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.11" } }, "nbformat": 4,