diff --git a/.gitignore b/.gitignore index e6ad9798..dd5dfc89 100644 --- a/.gitignore +++ b/.gitignore @@ -23,5 +23,8 @@ cufile.log node_modules/ jupyter_execute/ +# files manually written by example code +source/examples/rapids-azureml-hpo/Dockerfile + # exclusions !source/examples/rapids-1brc-single-node/lookup.csv diff --git a/pyproject.toml b/pyproject.toml index 8d639119..a8a57499 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,8 @@ select = [ "F", # isort "I", + # numpy + "NPY", # pyupgrade "UP", # flake8-bugbear diff --git a/source/cloud/azure/azureml.md b/source/cloud/azure/azureml.md index ffe502ac..96ccc396 100644 --- a/source/cloud/azure/azureml.md +++ b/source/cloud/azure/azureml.md @@ -4,7 +4,7 @@ review_priority: "p0" # Azure Machine Learning -RAPIDS can be deployed at scale using [Azure Machine Learning Service](https://learn.microsoft.com/en-us/azure/machine-learning/overview-what-is-azure-machine-learning) and easily scales up to any size needed. +RAPIDS can be deployed at scale using [Azure Machine Learning Service](https://learn.microsoft.com/en-us/azure/machine-learning/overview-what-is-azure-machine-learning) and can be scaled up to any size needed. ## Pre-requisites @@ -16,52 +16,55 @@ Follow these high-level steps to get started: **2. Workspace.** Within the Resource Group, create an Azure Machine Learning service Workspace. -**3. Config.** Within the Workspace, download the `config.json` file, as you will load the details to initialize workspace for running ML training jobs from within your notebook. - -![Screenshot of download config file](../../images/azureml-download-config-file.png) - -**4. Quota.** Check your Usage + Quota to ensure you have enough quota within your region to launch your desired cluster size. +**3. Quota.** Check your Usage + Quota to ensure you have enough quota within your region to launch your desired cluster size. 
## Azure ML Compute instance Although it is possible to install Azure Machine Learning on your local computer, it is recommended to utilize [Azure's ML Compute instances](https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-instance), fully managed and secure development environments that can also serve as a [compute target](https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-target?view=azureml-api-2) for ML training. -The compute instance provides an integrated Jupyter notebook service, JupyterLab, Azure ML Python SDK, CLI, and other essential [tools](https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-target?view=azureml-api-2). +The compute instance provides an integrated Jupyter notebook service, JupyterLab, Azure ML Python SDK, CLI, and other essential tools. ### Select your instance Sign in to [Azure Machine Learning Studio](https://ml.azure.com/) and navigate to your workspace on the left-side menu. -Select **Compute** > **+ New** (Create compute instance) > choose a [RAPIDS compatible GPU](https://medium.com/dropout-analytics/which-gpus-work-with-rapids-ai-f562ef29c75f) VM size (e.g., `Standard_NC12s_v3`) +Select **New** > **Compute instance** (Create compute instance) > choose a [RAPIDS compatible GPU](https://docs.rapids.ai/install/#system-req) VM size (e.g., `Standard_NC12s_v3`) ![Screenshot of create new notebook with a gpu-instance](../../images/azureml-create-notebook-instance.png) ### Provision RAPIDS setup script -Navigate to the **Applications** section and choose "Provision with a startup script" to install RAPIDS and dependencies. You can upload the script from your Notebooks files or local computer. - -Optional to enable SSH access to your compute (if needed). - -![Screenshot of the provision setup script screen](../../images/azureml-provision-setup-script.png) +Navigate to the **Applications** section. +Choose "Provision with a creation script" to install RAPIDS and dependencies. 
-Refer to [Azure ML documentation](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-customize-compute-instance) for more details on how to create the setup script but it should resemble: +Put the following in a local file called `rapids-azure-startup.sh`: ```bash #!/bin/bash sudo -u azureuser -i <<'EOF' +source /anaconda/etc/profile.d/conda.sh +conda create -y -n rapids \ + {{ rapids_conda_channels }} \ + -c microsoft \ + {{ rapids_conda_packages }} \ + 'azure-ai-ml>=2024.12' \ + 'azure-identity>=24.12' \ + ipykernel -conda create -y -n rapids {{ rapids_conda_channels }} {{ rapids_conda_packages }} ipykernel conda activate rapids -# install Python SDK v2 in rapids env -python -m pip install azure-ai-ml azure-identity - python -m ipykernel install --user --name rapids echo "kernel install completed" EOF ``` +Select `local file`, then `Browse`, and upload that script. + +![Screenshot of the provision setup script screen](../../images/azureml-provision-setup-script.png) + +Refer to [Azure ML documentation](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-customize-compute-instance) for more details on how to create the setup script. + Launch the instance. ### Select the RAPIDS environment @@ -76,30 +79,32 @@ The Compute cluster scales up automatically when a job is submitted, and execute ### Instantiate workspace -If using the Python SDK, connect to your workspace either by explicitly providing the workspace details or load from the `config.json` file downloaded in the pre-requisites section. +Use Azure's client libraries to set up some resources. ```python from azure.ai.ml import MLClient from azure.identity import DefaultAzureCredential -# Get a handle to the workspace -ml_client = MLClient( - credential=DefaultAzureCredential(), - subscription_id="", - resource_group_name="", - workspace_name="", -) - -# or load details from config file +# Get a handle to the workspace. 
+# +# By default, Azure ML places the workspace config in the +# default working directory for notebooks. +# +# If it isn't found, open a shell and look in the +# directory indicated by 'echo ${JUPYTER_SERVER_ROOT}'. ml_client = MLClient.from_config( credential=DefaultAzureCredential(), - path="config.json", + path="./config.json", ) ``` ### Create AMLCompute -You will need to create a [compute target](https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-target?view=azureml-api-2#azure-machine-learning-compute-managed) using Azure ML managed compute ([AmlCompute](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-ml/0.1.0b4/azure.ai.ml.entities.html)) for remote training. Note: Be sure to check limits within your available region. This [article](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-quotas?view=azureml-api-2#azure-machine-learning-compute) includes details on the default limits and how to request more quota. +You will need to create a [compute target](https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-target?view=azureml-api-2#azure-machine-learning-compute-managed) using Azure ML managed compute ([AmlCompute](https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-ml/0.1.0b4/azure.ai.ml.entities.html)) for remote training. + +Note: Be sure to check limits within your available region. + +This [article](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-quotas?view=azureml-api-2#azure-machine-learning-compute) includes details on the default limits and how to request more quota. [**size**]: The VM family of the nodes. 
Specify from one of **NC_v2**, **NC_v3**, **ND** or **ND_v2** GPU virtual machines (e.g `Standard_NC12s_v3`) @@ -142,17 +147,11 @@ You can define an environment from a [pre-built](https://learn.microsoft.com/en- Create your custom RAPIDS docker image using the example below, making sure to install additional packages needed for your workflows. ```dockerfile - # Use latest rapids image with the necessary dependencies FROM {{ rapids_container }} -# Update and/or install required packages -RUN apt-get update && \ - apt-get install -y --no-install-recommends build-essential fuse && \ - rm -rf /var/lib/apt/lists/* - -# Activate rapids conda environment -RUN /bin/bash -c "source activate rapids && pip install azureml-mlflow" +RUN conda install --yes -c conda-forge 'dask-ml>=2024.4.4' \ + && pip install azureml-mlflow ``` Now create the Environment, making sure to label and provide a description: @@ -160,8 +159,9 @@ Now create the Environment, making sure to label and provide a description: ```python from azure.ai.ml.entities import Environment, BuildContext +# NOTE: 'path' should be a filepath pointing to a directory containing a file named 'Dockerfile' env_docker_image = Environment( - build=BuildContext(path="Dockerfile"), + build=BuildContext(path="./training-code/"), name="rapids-mlflow", description="RAPIDS environment with azureml-mlflow", ) @@ -171,17 +171,45 @@ ml_client.environments.create_or_update(env_docker_image) ### Submit RAPIDS Training jobs -Now that we have our environment and custom logic, we can configure and run the `command` [class](https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml?view=azure-python#azure-ai-ml-command) to submit training jobs. `inputs` is a dictionary of command-line arguments to pass to the training script. 
+Now that we have our environment and custom logic, we can configure and run the `command` [class](https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml?view=azure-python#azure-ai-ml-command) to submit training jobs. + +In a notebook cell, copy the example code from this documentation into a new folder. + +```ipython +%%bash +mkdir -p ./training-code +repo_url='https://raw.githubusercontent.com/rapidsai/deployment/refs/heads/main/source/examples' + +# download training scripts +wget -O ./training-code/train_rapids.py "${repo_url}/rapids-azureml-hpo/train_rapids.py" +wget -O ./training-code/rapids_csp_azure.py "${repo_url}/rapids-azureml-hpo/rapids_csp_azure.py" +touch ./training-code/__init__.py + +# create a Dockerfile defining the image the code will run in +cat > ./training-code/Dockerfile <<EOF +FROM {{ rapids_container }} + +RUN conda install --yes -c conda-forge 'dask-ml>=2024.4.4' \ + && pip install azureml-mlflow +EOF +``` + +`inputs` is a dictionary of command-line arguments to pass to the training script. ```python from azure.ai.ml import command, Input -from azure.ai.ml.sweep import Choice, Uniform + +# replace this with your own dataset +datastore_name = "workspaceartifactstore" +dataset = "airline_20000000.parquet" +data_uri = f"azureml://subscriptions/{ml_client.subscription_id}/resourcegroups/{ml_client.resource_group_name}/workspaces/{ml_client.workspace_name}/datastores/{datastore_name}/paths/{dataset}" command_job = command( - environment="rapids-mlflow:1", # specify version of environment to use + environment=f"{env_docker_image.name}:{env_docker_image.version}", experiment_name="test_rapids_mlflow", - code=project_folder, - command="python train_rapids.py --data_dir ${{inputs.data_dir}} \ + code="./training-code", + command="python train_rapids.py \ + --data_dir ${{inputs.data_dir}} \ --n_bins ${{inputs.n_bins}} \ --cv_folds ${{inputs.cv_folds}} \ --n_estimators ${{inputs.n_estimators}} \ @@ -195,11 +223,19 @@ command_job = command( "max_depth": 10, "max_features": 1.0, }, - compute="rapids-cluster", + 
compute=gpu_compute.name, ) -returned_job = ml_client.jobs.create_or_update(command_job) # submit training job +# submit training job +returned_job = ml_client.jobs.create_or_update(command_job) +``` + +After creating the job, go to [the "Experiments" page](https://ml.azure.com/experiments) to view logs, metrics, and outputs. +Next, try performing a sweep over a set of hyperparameters. + +```python +from azure.ai.ml.sweep import Choice, Uniform # define hyperparameter space to sweep over command_job_for_sweep = command_job( @@ -210,19 +246,21 @@ command_job_for_sweep = command_job( # apply hyperparameter sweep_job sweep_job = command_job_for_sweep.sweep( - compute="rapids-cluster", + compute=gpu_compute.name, sampling_algorithm="random", primary_metric="Accuracy", goal="Maximize", ) -returned_sweep_job = ml_client.create_or_update(sweep_job) # submit hpo job +# submit job +returned_sweep_job = ml_client.create_or_update(sweep_job) ``` -### CleanUp +### Clean Up + +When you're done, remove the compute resources. 
```python -# Delete compute cluster ml_client.compute.begin_delete(gpu_compute.name).wait() ``` diff --git a/source/examples/rapids-azureml-hpo/Dockerfile b/source/examples/rapids-azureml-hpo/Dockerfile deleted file mode 100644 index bb90d5a1..00000000 --- a/source/examples/rapids-azureml-hpo/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -# Use rapids base image v23.02 with the necessary dependencies -FROM rapidsai/rapidsai:23.02-cuda11.8-runtime-ubuntu22.04-py3.10 - -# Update package information and install required packages -RUN apt-get update && \ - apt-get install -y --no-install-recommends build-essential fuse && \ - rm -rf /var/lib/apt/lists/* - -# Activate rapids conda environment -RUN /bin/bash -c "source activate rapids && pip install azureml-mlflow azureml-dataprep" diff --git a/source/examples/rapids-azureml-hpo/notebook.ipynb b/source/examples/rapids-azureml-hpo/notebook.ipynb index d6f6736e..3c6337ae 100644 --- a/source/examples/rapids-azureml-hpo/notebook.ipynb +++ b/source/examples/rapids-azureml-hpo/notebook.ipynb @@ -32,38 +32,11 @@ "````" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Name: azure-ai-ml\n", - "Version: 1.8.0\n", - "Summary: Microsoft Azure Machine Learning Client Library for Python\n", - "Home-page: https://github.com/Azure/azure-sdk-for-python\n", - "Author: Microsoft Corporation\n", - "Author-email: azuresdkengsysadmins@microsoft.com\n", - "License: MIT License\n", - "Location: /anaconda/envs/rapids/lib/python3.10/site-packages\n", - "Requires: azure-common, azure-core, azure-mgmt-core, azure-storage-blob, azure-storage-file-datalake, azure-storage-file-share, colorama, isodate, jsonschema, marshmallow, msrest, opencensus-ext-azure, pydash, pyjwt, pyyaml, strictyaml, tqdm, typing-extensions\n", - "Required-by: \n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "# verify 
Azure ML SDK version\n", - "\n", - "%pip show azure-ai-ml" - ] - }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "# Initialize Workspace" ] @@ -80,40 +53,25 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Workspace name: rapids-aml-cluster\n", - "Subscription id: fc4f4a6b-4041-4b1c-8249-854d68edcf62\n", - "Resource group: rapidsai-deployment\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "from azure.ai.ml import MLClient\n", "from azure.identity import DefaultAzureCredential\n", "\n", - "subscription_id = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", - "resource_group_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", - "workspace_name = \"FILL IN WITH YOUR AZURE ML CREDENTIALS\"\n", - "\n", - "# Get a handle to the workspace\n", - "ml_client = MLClient(\n", + "# Get a handle to the workspace.\n", + "#\n", + "# Azure ML places the workspace config at the default working\n", + "# directory for notebooks by default.\n", + "#\n", + "# If it isn't found, open a shell and look in the\n", + "# directory indicated by 'echo ${JUPYTER_SERVER_ROOT}'.\n", + "ml_client = MLClient.from_config(\n", " credential=DefaultAzureCredential(),\n", - " subscription_id=subscription_id,\n", - " resource_group_name=resource_group_name,\n", - " workspace_name=workspace_name,\n", - ")\n", - "\n", - "print(\n", - " \"Workspace name: \" + ml_client.workspace_name,\n", - " \"Subscription id: \" + ml_client.subscription_id,\n", - " \"Resource group: \" + ml_client.resource_group_name,\n", - " sep=\"\\n\",\n", + " path=\"./config.json\",\n", ")" ] }, @@ -135,18 +93,11 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "data uri: \n", - " 
azureml://subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster/datastores/workspaceartifactstore/paths/airline_20000000.parquet\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "datastore_name = \"workspaceartifactstore\"\n", "dataset = \"airline_20000000.parquet\"\n", @@ -184,33 +135,27 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "found compute target. Will use rapids-cluster\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ "from azure.ai.ml.entities import AmlCompute\n", "from azure.ai.ml.exceptions import MlException\n", "\n", "# specify aml compute name.\n", - "gpu_compute_target = \"rapids-cluster\"\n", + "target_name = \"rapids-cluster\"\n", "\n", "try:\n", " # let's see if the compute target already exists\n", - " gpu_target = ml_client.compute.get(gpu_compute_target)\n", - " print(f\"found compute target. Will use {gpu_compute_target}\")\n", + " gpu_target = ml_client.compute.get(target_name)\n", + " print(f\"found compute target. Will use {gpu_target.name}\")\n", "except MlException:\n", " print(\"Creating a new gpu compute target...\")\n", "\n", " gpu_target = AmlCompute(\n", - " name=\"rapids-cluster\",\n", + " name=target_name,\n", " type=\"amlcompute\",\n", " size=\"STANDARD_NC12S_V3\",\n", " max_instances=5,\n", @@ -259,16 +204,6 @@ "These run metrics will become particularly important when we begin hyperparameter tuning our model in the 'Tune model hyperparameters' section." 
] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "rapids_script = \"./train_rapids.py\"\n", - "azure_script = \"./rapids_csp_azure.py\"" - ] - }, { "cell_type": "markdown", "metadata": { @@ -282,70 +217,58 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Create Experiment\n", - "\n", - "Track all the runs in your workspace" + "## Setup Environment" ] }, { - "cell_type": "code", - "execution_count": 6, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "experiment_name = \"test_rapids_aml_cluster\"" + "We'll be using a custom RAPIDS docker image to [setup the environment](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image). This is available in `rapidsai/base` repo on [DockerHub](https://hub.docker.com/r/rapidsai/base/)." ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "## Setup Environment" + "%%bash\n", + "# create a Dockerfile defining the image the code will run in\n", + "cat > ./Dockerfile <=2024.4.4' \\\n", + " && pip install azureml-mlflow\n", + "EOF" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "We'll be using a custom RAPIDS docker image to [setup the environment](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-environments-v2?tabs=python#create-an-environment-from-a-docker-image). This is available in `rapidsai/rapidsai` repo on [DockerHub](https://hub.docker.com/r/rapidsai/rapidsai/).\n", - "\n", - "Make sure you have the correct path to the docker build context as `os.getcwd()`," + "Make sure you have the correct path to the docker build context as `os.getcwd()`." 
] }, { "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32mUploading code (0.33 MBs): 100%|██████████| 325450/325450 [00:00<00:00, 2363322.62it/s]\n", - "\u001b[39m\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "Environment({'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'rapids-mlflow', 'description': 'RAPIDS environment with azureml-mlflow', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourceGroups/rapidsai-deployment/providers/Microsoft.MachineLearningServices/workspaces/rapids-aml-cluster/environments/rapids-mlflow/versions/10', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/skirui1/code', 'creation_context': , 'serialize': , 'version': '10', 'latest_version': None, 'conda_file': None, 'image': None, 'build': , 'inference_config': None, 'os_type': 'Linux', 'arm_type': 'environment_version', 'conda_file_path': None, 'path': None, 'datastore': None, 'upload_hash': None, 'translated_conda_file': None})" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# RUN THIS CODE ONCE TO SETUP ENVIRONMENT\n", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ "import os\n", "\n", "from azure.ai.ml.entities import BuildContext, Environment\n", "\n", "env_docker_image = Environment(\n", " build=BuildContext(path=os.getcwd()),\n", - " name=\"rapids-mlflow\",\n", + " name=\"rapids-hpo\",\n", " description=\"RAPIDS environment with azureml-mlflow\",\n", ")\n", "\n", @@ -369,46 +292,20 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "tags": [ "library/randomforest", "library/cudf" ] }, - "outputs": [ - { - "name": "stderr", - 
"output_type": "stream", - "text": [ - "Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.\n", - "\u001b[32mUploading code (0.33 MBs): 100%|██████████| 327210/327210 [00:00<00:00, 1802654.05it/s]\n", - "\u001b[39m\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "'https://ml.azure.com/runs/zen_eye_lm7dcp68jz?wsid=/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster&tid=43083d15-7273-40c1-b7db-39efd9ccc17a'" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from azure.ai.ml import Input, command\n", "\n", "command_job = command(\n", - " environment=\"rapids-mlflow:1\",\n", - " experiment_name=experiment_name,\n", + " environment=f\"{env_docker_image.name}:{env_docker_image.version}\",\n", + " experiment_name=\"test_rapids_aml_hpo_cluster\",\n", " code=os.getcwd(),\n", " inputs={\n", " \"data_dir\": Input(type=\"uri_file\", path=data_uri),\n", @@ -419,12 +316,15 @@ " \"max_depth\": 6,\n", " 
\"max_features\": 0.3,\n", " },\n", - " command=(\n", - " \"python train_rapids.py --data_dir ${{inputs.data_dir}} --n_bins ${{inputs.n_bins}} \"\n", - " \"--compute ${{inputs.compute}} --cv_folds ${{inputs.cv_folds}} --n_estimators ${{inputs.n_estimators}} \"\n", - " \"--max_depth ${{inputs.max_depth}} --max_features ${{inputs.max_features}}\"\n", - " ),\n", - " compute=\"rapids-cluster\",\n", + " command=\"python train_rapids.py \\\n", + " --data_dir ${{inputs.data_dir}} \\\n", + " --n_bins ${{inputs.n_bins}} \\\n", + " --compute ${{inputs.compute}} \\\n", + " --cv_folds ${{inputs.cv_folds}} \\\n", + " --n_estimators ${{inputs.n_estimators}} \\\n", + " --max_depth ${{inputs.max_depth}} \\\n", + " --max_features ${{inputs.max_features}}\",\n", + " compute=gpu_target.name,\n", ")\n", "\n", "\n", @@ -465,8 +365,10 @@ }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "from azure.ai.ml.sweep import Choice, Uniform\n", @@ -479,19 +381,18 @@ "\n", "# apply sweep parameter to obtain the sweep_job\n", "sweep_job = command_job_for_sweep.sweep(\n", - " compute=\"rapids-cluster\",\n", + " compute=gpu_target.name,\n", " sampling_algorithm=\"random\",\n", " primary_metric=\"Accuracy\",\n", " goal=\"Maximize\",\n", ")\n", "\n", "\n", - "# Define the limits for this sweep\n", + "# Relax these limits to run more trials\n", "sweep_job.set_limits(\n", - " max_total_trials=10, max_concurrent_trials=2, timeout=18000, trial_timeout=3600\n", + " max_total_trials=5, max_concurrent_trials=5, timeout=18000, trial_timeout=3600\n", ")\n", "\n", - "\n", "# Specify your experiment details\n", "sweep_job.display_name = \"RF-rapids-sweep-job\"\n", "sweep_job.description = \"Run RAPIDS hyperparameter sweep job\"" @@ -506,8 +407,10 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# 
submit the hpo job\n", @@ -518,26 +421,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Monitor SweepJobs runs" + "## Monitor runs" ] }, { "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Monitor your job at https://ml.azure.com/runs/eager_turtle_r7fs2xzcty?wsid=/subscriptions/fc4f4a6b-4041-4b1c-8249-854d68edcf62/resourcegroups/rapidsai-deployment/workspaces/rapids-aml-cluster&tid=43083d15-7273-40c1-b7db-39efd9ccc17a\n" - ] - } - ], + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "aml_url = returned_sweep_job.studio_url\n", - "\n", - "print(\"Monitor your job at\", aml_url)" + "print(f\"Monitor your job at {returned_sweep_job.studio_url}\")" ] }, { @@ -556,8 +451,10 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "ml_client.jobs.download(returned_sweep_job.name, output_name=\"model\")" @@ -572,11 +469,13 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, + "execution_count": null, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "ml_client.compute.begin_delete(gpu_compute_target).wait()" + "ml_client.compute.begin_delete(gpu_target.name).wait()" ] } ], @@ -585,9 +484,9 @@ "name": "rapids" }, "kernelspec": { - "display_name": "rapids-23.06", + "display_name": "rapids", "language": "python", - "name": "rapids-23.06" + "name": "rapids" }, "language_info": { "codemirror_mode": { @@ -599,7 +498,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.8" }, "microsoft": { "ms_spell_check": { diff --git a/source/examples/rapids-azureml-hpo/train_rapids.py b/source/examples/rapids-azureml-hpo/train_rapids.py index 63ce4f5f..7e941cba 100644 --- a/source/examples/rapids-azureml-hpo/train_rapids.py +++ 
b/source/examples/rapids-azureml-hpo/train_rapids.py @@ -20,7 +20,6 @@ import cudf import cuml import mlflow -import numpy as np from rapids_csp_azure import PerfTimer, RapidsCloudML @@ -62,13 +61,13 @@ def main(): cv_folds = args.cv_folds n_estimators = args.n_estimators - mlflow.log_param("n_estimators", np.int(args.n_estimators)) + mlflow.log_param("n_estimators", int(args.n_estimators)) max_depth = args.max_depth - mlflow.log_param("max_depth", np.int(args.max_depth)) + mlflow.log_param("max_depth", int(args.max_depth)) n_bins = args.n_bins - mlflow.log_param("n_bins", np.int(args.n_bins)) + mlflow.log_param("n_bins", int(args.n_bins)) max_features = args.max_features - mlflow.log_param("max_features", np.str(args.max_features)) + mlflow.log_param("max_features", str(args.max_features)) print("\n---->>>> cuDF version <<<<----\n", cudf.__version__) print("\n---->>>> cuML version <<<<----\n", cuml.__version__) @@ -156,9 +155,9 @@ def main(): global_best_test_accuracy = test_accuracy mlflow.log_metric( - "Total training inference time", np.float(training_time + infer_time) + "Total training inference time", float(training_time + infer_time) ) - mlflow.log_metric("Accuracy", np.float(global_best_test_accuracy)) + mlflow.log_metric("Accuracy", float(global_best_test_accuracy)) print("\n Accuracy :", global_best_test_accuracy) print("\n accuracy per fold :", accuracy_per_fold) print("\n train-time per fold :", train_time_per_fold) @@ -171,5 +170,5 @@ def main(): with PerfTimer() as total_script_time: main() print(f"Total runtime: {total_script_time.duration:.2f}") - mlflow.log_metric("Total runtime", np.float(total_script_time.duration)) + mlflow.log_metric("Total runtime", float(total_script_time.duration)) print("\n Exiting script") diff --git a/source/examples/rapids-optuna-hpo/notebook.ipynb b/source/examples/rapids-optuna-hpo/notebook.ipynb index 3f16ccf3..79c181a2 100644 --- a/source/examples/rapids-optuna-hpo/notebook.ipynb +++ 
b/source/examples/rapids-optuna-hpo/notebook.ipynb @@ -380,7 +380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.10.11" } }, "nbformat": 4, diff --git a/source/images/azureml-download-config-file.png b/source/images/azureml-download-config-file.png deleted file mode 100644 index 76d9a1a9..00000000 Binary files a/source/images/azureml-download-config-file.png and /dev/null differ diff --git a/source/images/azureml-provision-setup-script.png b/source/images/azureml-provision-setup-script.png index d7ed288e..4fa05ebc 100644 Binary files a/source/images/azureml-provision-setup-script.png and b/source/images/azureml-provision-setup-script.png differ