From a4b794e25ab784713343c345f4450648f88a8a3e Mon Sep 17 00:00:00 2001
From: Istvan Kiss
Date: Fri, 10 May 2024 12:14:21 +0200
Subject: [PATCH] Update Cooperative Groups How to
---
 docs/how-to/cooperative_groups.rst | 279 ++++++++++++++++++++++++++++-
 1 file changed, 275 insertions(+), 4 deletions(-)

diff --git a/docs/how-to/cooperative_groups.rst b/docs/how-to/cooperative_groups.rst
index 4781e50713..d765a61bf5 100644
--- a/docs/how-to/cooperative_groups.rst
+++ b/docs/how-to/cooperative_groups.rst
@@ -6,7 +6,278 @@
Cooperative Groups in HIP
*******************************************************************************

-The Cooperative Groups API introduces new APIs to launch, group, subdivide,
-synchronize and identify threads, as well as some predefined group-collective
-algorithms, but most importantly a matching threading model to think in terms
-of. \ No newline at end of file

Cooperative Groups is an extension of the existing ROCm programming model,
providing a more flexible grouping mechanism for developers. This feature was
first introduced in CUDA 9.

The API is accessible in the ``cooperative_groups`` namespace after the
``cooperative_groups.h`` header is included. The header contains the following
elements:

* Data types for representing groups;
* Operations to generate the implicit groups;
* Collectives for partitioning existing groups into new groups;
* Operations to synchronize all threads within a group;
* Operations to inspect the group properties;
* Collectives that expose low-level, group-specific and often hardware-accelerated operations.

The code difference from the original block model can be found in the following table.

.. list-table:: Cooperative Group Example
   :header-rows: 1
   :widths: 50,50

   * - **Original Block**
     - **Cooperative Groups**

   * - .. 
code-block:: C++

        __device__ int reduce_sum(int *shared, int val) {

            // Thread ID
            const unsigned int thread_id = threadIdx.x;

            // Every iteration the number of active threads halves,
            // until we processed all values
            for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) {
                // Store value in shared memory with thread ID
                shared[thread_id] = val;

                // Synchronize all threads
                __syncthreads();

                // Active threads sum up
                if(thread_id < i)
                    val += shared[thread_id + i];

                // Synchronize all threads
                __syncthreads();
            }

            // ...
        }

     - .. code-block:: C++

        __device__ int reduce_sum(thread_group g, int *shared, int val) {

            // Thread ID
            const unsigned int group_thread_id = g.thread_rank();

            // Every iteration the number of active threads halves,
            // until we processed all values
            for(unsigned int i = g.size() / 2; i > 0; i /= 2) {
                // Store value in shared memory with thread ID
                shared[group_thread_id] = val;

                // Synchronize all threads in the group
                g.sync();

                // Active threads sum up
                if(group_thread_id < i)
                    val += shared[group_thread_id + i];

                // Synchronize all threads in the group
                g.sync();
            }

            // ...
        }

   * - .. code-block:: C++

        __global__ void sum_kernel(...) {
            // ...

            // Workspace array in shared memory
            __shared__ unsigned int workspace[2048];

            // ...

            // Perform reduction
            output = reduce_sum(workspace, input);

            // ...
        }

     - .. code-block:: C++

        __global__ void sum_kernel(...) {
            // ...

            // Workspace array in shared memory
            __shared__ unsigned int workspace[2048];

            // ...

            thread_block thread_block_group = this_thread_block();
            // Perform reduction
            output = reduce_sum(thread_block_group, workspace, input);

            // ...
        }

The kernel launch is also different in the Cooperative Groups case, and it
depends on the cooperative group type. For grid groups on a single GPU,
``hipLaunchCooperativeKernel`` has to be used.

Group Types
===========

Thread Block Group
------------------

Cluster Group
-------------

Grid Group
----------

Multi Grid Group
----------------

Thread Block Tile
-----------------

Coalesced Groups
----------------

Synchronization
===============

Intra-Workgroup or Intra-Block Synchronization
----------------------------------------------

Grid Synchronization
--------------------

Check the cooperative launch capability on a single AMD GPU:

.. code-block:: C++

    int device = 0;
    int supports_coop_launch = 0;
    // Check support
    // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
    HIP_CHECK(hipGetDevice(&device));
    HIP_CHECK(
        hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device));
    if(!supports_coop_launch)
    {
        std::cout << "Skipping, device " << device << " does not support cooperative groups"
                  << std::endl;
        return 0;
    }

Launch the cooperative kernel on a single GPU:

.. code-block:: C++

    void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
    // Launching kernel from host.
    HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel,
                                         dim3(num_blocks),
                                         dim3(threads_per_block),
                                         params,
                                         0,
                                         hipStreamDefault));

The device-side synchronization on a single GPU:

.. code-block:: C++

    grid_group grid = this_grid();
    grid.sync();

Multi-GPU Synchronization
-------------------------

Check the cooperative launch capability over multiple GPUs:

.. 
code-block:: C++

    #ifdef __HIP_PLATFORM_AMD__
    int supports_coop_launch = 0;
    // Check support on every device
    // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
    for (int i = 0; i < numGPUs; i++) {
        HIP_CHECK(
            hipDeviceGetAttribute(
                &supports_coop_launch,
                hipDeviceAttributeCooperativeMultiDeviceLaunch,
                i));
        if(!supports_coop_launch)
        {
            std::cout << "Skipping, device " << i << " does not support cooperative groups"
                      << std::endl;
            return 0;
        }
    }
    #endif

Launch the cooperative kernel on each GPU:

.. code-block:: C++

    void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
    // Launching kernel from host.
    HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel,
                                         dim3(num_blocks),
                                         dim3(threads_per_block),
                                         params,
                                         0,
                                         hipStreamDefault));

The device-side synchronization over multiple GPUs:

.. code-block:: C++

    multi_grid_group multi_grid = this_multi_grid();
    multi_grid.sync();

Missing CUDA features
=====================

The following CUDA optional headers are not supported on HIP:

.. code-block:: C++

    // Optionally include for memcpy_async() collective
    #include <cooperative_groups/memcpy_async.h>
    // Optionally include for reduce() collective
    #include <cooperative_groups/reduce.h>
    // Optionally include for inclusive_scan() and exclusive_scan() collectives
    #include <cooperative_groups/scan.h>

The following table lists the Cooperative Groups functions that are missing from HIP:

.. list-table:: Missing Cooperative features
   :header-rows: 1

   * - **Function**
     - **Supported on Host**
     - **Supported on Device**

8.2.1. CUDA 12.2
barrier_arrive and barrier_wait member functions were added for grid_group and thread_block. A description of the API is available in the CUDA documentation.

8.2.2. CUDA 12.1
invoke_one and invoke_one_broadcast APIs were added.

8.2.3. 
CUDA 12.0
The following experimental APIs are now moved to the main namespace:

* asynchronous ``reduce`` and ``scan``, updated in CUDA 11.7
* ``thread_block_tile`` larger than 32, added in CUDA 11.1

It is no longer required to provide memory using the ``block_tile_memory`` object in order to create these large tiles on Compute Capability 8.0 or higher.

8.3. Programming Model Concept