From 48d18fbd4cf785e1f69a6555d97a39023a5d199e Mon Sep 17 00:00:00 2001
From: Banit Agrawal
Date: Tue, 17 Sep 2024 19:08:44 +0000
Subject: [PATCH] [PyTorch CUDA Allocator] Allow reuse of non-split blocks with
 better rounding (#136174)

Summary:

This diff adds an option to round non-split blocks in the caching allocator so
that they can be reused without causing excessive fragmentation of large memory
segments.

For example, if we set the max_split size to 400MB, allocations larger than
400MB are never split. Suppose some 1024MB blocks have been allocated and are
now cached in the allocator. If we request a new 500MB block, it is rounded up
to the nearest power-of-2 division, i.e. 512MB, and the default kLargeBuffer of
20MB is added, giving 532MB. Since 532MB is less than the existing 1024MB
block, the 1024MB block is not used for this allocation; instead a new 512MB
block is created with cudaMalloc.

This diff exposes the rounding margin (previously hard-coded to kLargeBuffer)
as a configurable option, max_non_split_rounding_mb: if 512MB +
max_non_split_rounding_size is greater than 1024MB, the cached 1024MB block is
reused and no new 512MB block is created via cudaMalloc. This lets us
pre-allocate some large blocks, reuse them as much as possible, and avoid
stalling on cudaMalloc calls. (An illustrative sketch of this check follows
the CUDAAllocatorConfig.cpp diff below.)

Differential Revision: D62758758

Pull Request resolved: https://github.com/pytorch/pytorch/pull/136174
Approved by: https://github.com/zyan0
---
 c10/cuda/CUDAAllocatorConfig.cpp  | 25 +++++++++++++++++++++++++
 c10/cuda/CUDAAllocatorConfig.h    |  8 ++++++++
 c10/cuda/CUDACachingAllocator.cpp |  3 ++-
 docs/source/notes/cuda.rst        |  7 +++++++
 4 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 19aedb2cbb02f7..7b410bdd6ef557 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -12,6 +12,7 @@ constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
 
 CUDAAllocatorConfig::CUDAAllocatorConfig()
     : m_max_split_size(std::numeric_limits<size_t>::max()),
+      m_max_non_split_rounding_size(kLargeBuffer),
       m_garbage_collection_threshold(0),
       m_pinned_num_register_threads(1),
       m_expandable_segments(false),
@@ -94,6 +95,27 @@ size_t CUDAAllocatorConfig::parseMaxSplitSize(
   return i;
 }
 
+size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  constexpr int mb = 1024 * 1024;
+  if (++i < config.size()) {
+    size_t val1 = stoi(config[i]);
+    TORCH_CHECK(
+        val1 > kLargeBuffer / mb,
+        "CachingAllocator option max_non_split_rounding_mb too small, must be > ",
+        kLargeBuffer / mb,
+        "");
+    val1 = std::max(val1, kLargeBuffer / mb);
+    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
+    m_max_non_split_rounding_size = val1 * 1024 * 1024;
+  } else {
+    TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
+  }
+  return i;
+}
+
 size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
     const std::vector<std::string>& config,
     size_t i) {
@@ -258,6 +280,9 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
     if (config_item_view == "max_split_size_mb") {
       i = parseMaxSplitSize(config, i);
       used_native_specific_option = true;
+    } else if (config_item_view == "max_non_split_rounding_mb") {
+      i = parseMaxNonSplitRoundingSize(config, i);
+      used_native_specific_option = true;
     } else if (config_item_view == "garbage_collection_threshold") {
       i = parseGarbageCollectionThreshold(config, i);
       used_native_specific_option = true;
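To make the reuse arithmetic from the summary concrete, here is a small
illustrative Python sketch (not part of the patch and not the allocator's
actual code; the function name is invented). It mirrors the condition changed
in DeviceCachingAllocator::get_free_block, shown in the CUDACachingAllocator.cpp
hunk further below.

    MB = 1024 * 1024

    def can_reuse_cached_block(cached_bytes, request_bytes, rounding_bytes):
        # Mirrors the guard in get_free_block: an oversized cached block is
        # skipped (not reused) when its size is at least the request size
        # plus the configured rounding allowance.
        return cached_bytes < request_bytes + rounding_bytes

    request = 512 * MB        # a 500MB request, rounded up to the 512MB power-of-2 division
    cached_block = 1024 * MB  # an existing cached, non-split 1024MB block

    # Default allowance (kLargeBuffer, 20MB): 1024MB >= 512MB + 20MB, so the
    # cached block is skipped and a new 512MB block is created via cudaMalloc.
    print(can_reuse_cached_block(cached_block, request, 20 * MB))    # False

    # With max_non_split_rounding_mb:1024, 1024MB < 512MB + 1024MB, so the
    # cached 1024MB block is reused and the cudaMalloc call is avoided.
    print(can_reuse_cached_block(cached_block, request, 1024 * MB))  # True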
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 3106fc1b46baee..38adc4732e3d6d 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -63,6 +63,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_roundup_power2_divisions;
   }
 
+  static size_t max_non_split_rounding_size() {
+    return instance().m_max_non_split_rounding_size;
+  }
+
   static std::string last_allocator_settings() {
     std::lock_guard<std::mutex> lock(
         instance().m_last_allocator_settings_mutex);
@@ -90,6 +94,9 @@
       size_t i,
       const char c);
   size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
+  size_t parseMaxNonSplitRoundingSize(
+      const std::vector<std::string>& config,
+      size_t i);
   size_t parseGarbageCollectionThreshold(
       const std::vector<std::string>& config,
       size_t i);
@@ -108,6 +115,7 @@
       size_t i);
 
   std::atomic<size_t> m_max_split_size;
+  std::atomic<size_t> m_max_non_split_rounding_size;
   std::vector<size_t> m_roundup_power2_divisions;
   std::atomic<double> m_garbage_collection_threshold;
   std::atomic<size_t> m_pinned_num_register_threads;
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 3da3c6d4f5d05a..a67a720717bb7f 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2527,7 +2527,8 @@ class DeviceCachingAllocator {
       return false;
     // Allow oversized block size to be rounded up but within a limit
     if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
-        ((*it)->size >= p.size() + kLargeBuffer))
+        ((*it)->size >=
+         p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
       return false;
     p.block = *it;
     pool.blocks.erase(it);
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index 7d434bbbba64ce..c0b82adc7e073a 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -471,6 +471,13 @@ Available options:
   set the knob value to: [256:1,512:2,1024:4,>:8].
   ``roundup_power2_divisions`` is only meaningful with ``backend:native``.
   With ``backend:cudaMallocAsync``, ``roundup_power2_divisions`` is ignored.
+* ``max_non_split_rounding_mb`` allows non-split blocks to be reused more readily,
+  e.g., a cached 1024MB block can serve a 512MB allocation request. By default we
+  only allow up to 20MB of rounding for non-split blocks, so a 512MB request can
+  only be served by a cached block of size 512-532MB. Setting this option to 1024
+  lets a 512MB request be served by any cached block of size 512-1536MB, which
+  increases the reuse of larger blocks. This also helps reduce stalls by avoiding
+  expensive cudaMalloc calls.
 * ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to
   avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks),
   which can be unfavorable to latency-critical GPU applications (e.g., servers).
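For completeness, a minimal usage sketch (not part of the patch): it enables
the new option together with max_split_size_mb through the
PYTORCH_CUDA_ALLOC_CONF environment variable documented in this section. The
specific sizes are illustrative, not recommendations, and setting the variable
before importing torch is just one simple way to ensure it is seen before the
caching allocator is initialized.

    import os

    # Assumed example values: do not split blocks above 400MB, and allow cached
    # non-split blocks up to 1024MB larger than a request to be reused for it.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
        "max_split_size_mb:400,max_non_split_rounding_mb:1024"
    )

    import torch

    # A ~500MB allocation; with the settings above, an already-cached 1024MB
    # block (if present) may serve it instead of triggering a new cudaMalloc.
    x = torch.empty(500 * 1024 * 1024, dtype=torch.uint8, device="cuda")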