From a4b794e25ab784713343c345f4450648f88a8a3e Mon Sep 17 00:00:00 2001
From: Istvan Kiss
Date: Fri, 10 May 2024 12:14:21 +0200
Subject: [PATCH] Update Cooperative Groups How to
---
 docs/how-to/cooperative_groups.rst | 279 ++++++++++++++++++++++++++++-
 1 file changed, 275 insertions(+), 4 deletions(-)

diff --git a/docs/how-to/cooperative_groups.rst b/docs/how-to/cooperative_groups.rst
index 4781e50713..d765a61bf5 100644
--- a/docs/how-to/cooperative_groups.rst
+++ b/docs/how-to/cooperative_groups.rst
@@ -6,7 +6,278 @@
Cooperative Groups in HIP
*******************************************************************************

-The Cooperative Groups API introduces new APIs to launch, group, subdivide,
-synchronize and identify threads, as well as some predefined group-collective
-algorithms, but most importantly a matching threading model to think in terms
-of. \ No newline at end of file

Cooperative Groups is an extension of the existing ROCm programming model,
providing a more flexible grouping mechanism for developers. This feature was
first introduced in CUDA 9.

The API is accessible in the ``cooperative_groups`` namespace after the
``cooperative_groups.h`` header is included. The header contains the following
elements:

* Data types for representing groups;
* Operations to generate the implicit groups;
* Collectives for partitioning existing groups into new groups;
* Operations to synchronize all threads within a group;
* Operations to inspect the group properties;
* Collectives that expose low-level, group-specific and often hardware-accelerated operations.

The code difference from the original block model can be found in the following table.

.. list-table:: Cooperative Group Example
   :header-rows: 1
   :widths: 50,50

   * - **Original Block**
     - **Cooperative Groups**

   * - .. 
code-block:: C++

        __device__ int reduce_sum(int *shared, int val) {

            // Thread ID
            const unsigned int thread_id = threadIdx.x;

            // Every iteration the number of active threads halves,
            // until we processed all values
            for(unsigned int i = blockDim.x / 2; i > 0; i /= 2) {
                // Store value in shared memory with thread ID
                shared[thread_id] = val;

                // Synchronize all threads
                __syncthreads();

                // Active threads sum up
                if(thread_id < i)
                    val += shared[thread_id + i];

                // Synchronize all threads
                __syncthreads();
            }

            // ...
        }

     - .. code-block:: C++

        __device__ int reduce_sum(thread_group g, int *shared, int val) {

            // Thread ID
            const unsigned int group_thread_id = g.thread_rank();

            // Every iteration the number of active threads halves,
            // until we processed all values
            for(unsigned int i = g.size() / 2; i > 0; i /= 2) {
                // Store value in shared memory with thread ID
                shared[group_thread_id] = val;

                // Synchronize all threads in the group
                g.sync();

                // Active threads sum up
                if(group_thread_id < i)
                    val += shared[group_thread_id + i];

                // Synchronize all threads in the group
                g.sync();
            }

            // ...
        }

   * - .. code-block:: C++

        __global__ void sum_kernel(...) {
            // ...

            // Workspace array in shared memory
            __shared__ unsigned int workspace[2048];

            // ...

            // Perform reduction
            output = reduce_sum(workspace, input);

            // ...
        }

     - .. code-block:: C++

        __global__ void sum_kernel(...) {
            // ...

            // Workspace array in shared memory
            __shared__ unsigned int workspace[2048];

            // ...

            thread_block thread_block_group = this_thread_block();
            // Perform reduction
            output = reduce_sum(thread_block_group, workspace, input);

            // ...
        }

The kernel launch is also different in the Cooperative Groups case, and it
depends on the cooperative group type. For grid groups on a single GPU,
``hipLaunchCooperativeKernel`` has to be used.

Group Types
===========

Thread Block Group
------------------

Cluster Group
-------------

Grid Group
----------

Multi Grid Group
----------------

Thread Block Tile
-----------------

Coalesced Groups
----------------

Synchronization
===============

Intra-Workgroup or Intra-Block Synchronization
----------------------------------------------

Grid Synchronization
--------------------

Check the cooperative launch capability on a single AMD GPU:

.. code-block:: C++

    int device = 0;
    int supports_coop_launch = 0;
    // Check support
    // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
    HIP_CHECK(hipGetDevice(&device));
    HIP_CHECK(
        hipDeviceGetAttribute(&supports_coop_launch, hipDeviceAttributeCooperativeLaunch, device));
    if(!supports_coop_launch)
    {
        std::cout << "Skipping, device " << device << " does not support cooperative groups"
                  << std::endl;
        return 0;
    }

Launch the cooperative kernel on a single GPU:

.. code-block:: C++

    void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
    // Launching kernel from host.
    HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel,
                                         dim3(num_blocks),
                                         dim3(threads_per_block),
                                         params,
                                         0,
                                         hipStreamDefault));

The device-side synchronization on a single GPU:

.. code-block:: C++

    grid_group grid = this_grid();
    grid.sync();

Multi-GPU Synchronization
-------------------------

Check the cooperative launch capability over multiple GPUs:

.. 
code-block:: C++

    #ifdef __HIP_PLATFORM_AMD__
    int supports_coop_launch = 0;
    // Check support on every device
    // Use hipDeviceAttributeCooperativeMultiDeviceLaunch when launching across multiple devices
    for (int i = 0; i < numGPUs; i++) {
        HIP_CHECK(
            hipDeviceGetAttribute(
                &supports_coop_launch,
                hipDeviceAttributeCooperativeMultiDeviceLaunch,
                i));
        if(!supports_coop_launch)
        {
            std::cout << "Skipping, device " << i << " does not support cooperative groups"
                      << std::endl;
            return 0;
        }
    }
    #endif

Launch the cooperative kernel on each GPU:

.. code-block:: C++

    void* params[] = {&d_vector, &d_block_reduced, &d_partition_reduced};
    // Launching kernel from host.
    HIP_CHECK(hipLaunchCooperativeKernel(vector_reduce_kernel,
                                         dim3(num_blocks),
                                         dim3(threads_per_block),
                                         params,
                                         0,
                                         hipStreamDefault));

The device-side synchronization over multiple GPUs:

.. code-block:: C++

    multi_grid_group multi_grid = this_multi_grid();
    multi_grid.sync();

Missing CUDA features
=====================

The following CUDA optional headers are not supported on HIP:

.. code-block:: C++

    // Optionally include for memcpy_async() collective
    #include <cooperative_groups/memcpy_async.h>
    // Optionally include for reduce() collective
    #include <cooperative_groups/reduce.h>
    // Optionally include for inclusive_scan() and exclusive_scan() collectives
    #include <cooperative_groups/scan.h>

The following table lists the Cooperative Groups functions that are missing from HIP:

.. list-table:: Missing Cooperative features
   :header-rows: 1

   * - **Function**
     - **Supported on Host**
     - **Supported on Device**

8.2.1. CUDA 12.2
barrier_arrive and barrier_wait member functions were added for grid_group and thread_block. A description of the API is available in the CUDA documentation.

8.2.2. CUDA 12.1
invoke_one and invoke_one_broadcast APIs were added.

8.2.3. 
CUDA 12.0
The following experimental APIs are now moved to the main namespace:

* asynchronous ``reduce`` and ``scan``, updated in CUDA 11.7
* ``thread_block_tile`` larger than 32, added in CUDA 11.1

It is no longer required to provide memory using the ``block_tile_memory`` object in order to create these large tiles on Compute Capability 8.0 or higher.

8.3. Programming Model Concept