From f4821c53b9f3c3a961e944954a7b44954c11cd97 Mon Sep 17 00:00:00 2001
From: Adam Talbot <12817534+adamrtalbot@users.noreply.github.com>
Date: Tue, 3 Dec 2024 17:09:41 +0000
Subject: [PATCH] Add GPU test (#30)

Adds GPU test originally built by @robsyme.
---
 README.md            |  6 ++++
 main.nf              | 86 +++++++++++++++++++++++++++++++++++++++++++-
 nextflow.config      |  1 +
 nextflow_schema.json |  4 +++
 4 files changed, 96 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0639c03..4077ccb 100644
--- a/README.md
+++ b/README.md
@@ -124,3 +124,9 @@ Tests moving the contents of a folder to a new folder within the working directo
 ### `TEST_VAL_INPUT`
 
 Test a process can accept a value as input.
+
+### `TEST_GPU`
+
+_Note: Enabled only if the parameter `--gpu` is specified._
+
+This process tests the ability to use a GPU. It uses the `pytorch` conda environment to test that CUDA is available and working. It is disabled by default because it requires a GPU, which may not be available.
diff --git a/main.nf b/main.nf
index 1f6253c..a2c7484 100644
--- a/main.nf
+++ b/main.nf
@@ -252,10 +252,92 @@ process TEST_VAL_INPUT {
     """
 }
 
+process TEST_GPU {
+    // Test that a CUDA GPU is visible and beats the CPU at a matrix multiplication
+    container 'pytorch/pytorch:latest'
+    conda 'pytorch::pytorch=2.5.1 pytorch::torchvision=0.20.1 nvidia::cuda=12.1'
+    accelerator 1
+    memory '10G'
+
+    input:
+    val input // Unused trigger value; the workflow only emits it when --gpu is set
+
+    output:
+    stdout
+
+
+    script:
+    """
+    #!/usr/bin/env python
+    import time
+    import torch
+
+    # Print GPU and CUDA details, failing fast when no CUDA device is visible
+    def print_gpu_info():
+        if not torch.cuda.is_available():
+            raise SystemExit("CUDA is not available on this system.")
+        print(f"GPU: {torch.cuda.get_device_name(0)}")
+        print(f"CUDA Version: {torch.version.cuda}")
+
+    # Multiply two matrices on the CPU
+    def cpu_computation(x, y):
+        return torch.mm(x, y)
+
+    # Multiply two matrices on the GPU
+    def gpu_computation(x, y):
+        result = torch.mm(x, y)
+        torch.cuda.synchronize()  # Ensure the computation is done
+        return result
+
+    # Print GPU and CUDA details
+    print_gpu_info()
+
+    # Build the inputs once so the CPU and GPU multiply the same matrices
+    size = 10000
+    x_cpu = torch.rand(size, size)
+    y_cpu = torch.rand(size, size)
+
+    # Copy the inputs to the GPU and warm it up so that CUDA start-up and
+    # host-to-device transfer costs are excluded from the GPU timing
+    x_gpu = x_cpu.cuda()
+    y_gpu = y_cpu.cuda()
+    gpu_computation(x_gpu[:1], y_gpu)
+
+    # Measure time for CPU computation
+    start_time = time.perf_counter()
+    cpu_result = cpu_computation(x_cpu, y_cpu)
+    cpu_time = time.perf_counter() - start_time
+    print(f"CPU computation time: {cpu_time:.4f} seconds")
+
+    # Measure time for GPU computation
+    start_time = time.perf_counter()
+    gpu_result = gpu_computation(x_gpu, y_gpu)
+    gpu_time = time.perf_counter() - start_time
+    print(f"GPU computation time: {gpu_time:.4f} seconds")
+
+    # Same inputs, so the results should agree up to float32 rounding
+    if torch.allclose(cpu_result, gpu_result.cpu(), rtol=1e-3):
+        print("Results are close enough!")
+    else:
+        print("Results differ!")
+
+    # Print the time difference
+    time_difference = cpu_time - gpu_time
+    print(f"Time difference (CPU - GPU): {time_difference:.4f} seconds")
+
+    if time_difference < 0:
+        raise Exception("GPU is slower than CPU indicating no GPU utilization")
+    """
+
+}
+
 workflow NF_CANARY {
 
     main:
 
+        Channel.of('dummy')
+            .set { dummy }
+
         // Create test file on head node
         Channel
             .of("alpha", "beta", "gamma")
@@ -281,6 +363,7 @@ workflow NF_CANARY {
 
         TEST_MV_FOLDER_CONTENTS()
         TEST_VAL_INPUT("Hello World")
+        TEST_GPU( dummy.filter { params.gpu } )
         // POC of emitting the channel
         Channel.empty()
             .mix(
@@ -297,7 +380,8 @@ workflow NF_CANARY {
                 TEST_PUBLISH_FOLDER.out,
                 TEST_IGNORED_FAIL.out,
                 TEST_MV_FILE.out,
-                TEST_MV_FOLDER_CONTENTS.out
+                TEST_MV_FOLDER_CONTENTS.out,
+                TEST_GPU.out
             )
             .set { ch_out }
 
diff --git a/nextflow.config b/nextflow.config
index 17d5906..e7e4b34 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,5 +1,6 @@
 params {
     skip = ''
+    gpu = false
     run = null
     outdir = null
     remoteFile = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index f1095bc..eeeff4f 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -28,6 +28,10 @@
                     "help_text": "Path to a remote file to use within the pipeline. This mimics a remote set of files such as reference data that may need to be retrieved prior to analysis. By default this is not specified and the test is not ran, add a remote file using standard Nextflow filenaming to pull a file from your storage (e.g. an S3 bucket or shared storage).",
                     "format": "path"
                 },
+                "gpu": {
+                    "type": "boolean",
+                    "description": "Whether to test GPU utilization within a process."
+                },
                 "outdir": {
                     "type": "string",
                     "format": "directory-path",