From ca2cf28606164f8dc8ad09f32124f7bd2c962426 Mon Sep 17 00:00:00 2001 From: v01dstar Date: Thu, 5 Sep 2024 12:16:21 -0700 Subject: [PATCH] Add write amp based rate limiter Signed-off-by: v01dstar Signed-off-by: Yang Zhang --- CMakeLists.txt | 2 + Makefile | 3 + TARGETS | 7 + db/c.cc | 10 + db/column_family.cc | 22 +- db/column_family.h | 3 +- include/rocksdb/c.h | 4 + include/rocksdb/rate_limiter.h | 13 + src.mk | 2 + .../write_amp_based_rate_limiter.cc | 466 ++++++++++++++++++ .../write_amp_based_rate_limiter.h | 172 +++++++ .../write_amp_based_rate_limiter_test.cc | 207 ++++++++ 12 files changed, 906 insertions(+), 5 deletions(-) create mode 100644 utilities/rate_limiters/write_amp_based_rate_limiter.cc create mode 100644 utilities/rate_limiters/write_amp_based_rate_limiter.h create mode 100644 utilities/rate_limiters/write_amp_based_rate_limiter_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 23a4014bc08..3a6c9bba6ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -912,6 +912,7 @@ set(SOURCES utilities/persistent_cache/block_cache_tier_metadata.cc utilities/persistent_cache/persistent_cache_tier.cc utilities/persistent_cache/volatile_tier_impl.cc + utilities/rate_limiters/write_amp_based_rate_limiter.cc utilities/simulator_cache/cache_simulator.cc utilities/simulator_cache/sim_cache.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc @@ -1450,6 +1451,7 @@ if(WITH_TESTS) utilities/options/options_util_test.cc utilities/persistent_cache/hash_table_test.cc utilities/persistent_cache/persistent_cache_test.cc + utilities/rate_limiters/write_amp_based_rate_limiter_test.cc utilities/simulator_cache/cache_simulator_test.cc utilities/simulator_cache/sim_cache_test.cc utilities/table_properties_collectors/compact_on_deletion_collector_test.cc diff --git a/Makefile b/Makefile index c942d148dc1..d14c6a8837a 100644 --- a/Makefile +++ b/Makefile @@ -1990,6 +1990,9 @@ wide_column_serialization_test: $(OBJ_DIR)/db/wide/wide_column_serialization_tes wide_columns_helper_test: $(OBJ_DIR)/db/wide/wide_columns_helper_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +write_amp_based_rate_limiter_test: $(OBJ_DIR)/utilities/rate_limiters/write_amp_based_rate_limiter_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + #------------------------------------------------- # make install related stuff PREFIX ?= /usr/local diff --git a/TARGETS b/TARGETS index e8aaf325d46..394fe6e636d 100644 --- a/TARGETS +++ b/TARGETS @@ -310,6 +310,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[ "utilities/persistent_cache/block_cache_tier_metadata.cc", "utilities/persistent_cache/persistent_cache_tier.cc", "utilities/persistent_cache/volatile_tier_impl.cc", + "utilities/rate_limiters/write_amp_based_rate_limiter.cc", "utilities/simulator_cache/cache_simulator.cc", "utilities/simulator_cache/sim_cache.cc", "utilities/table_properties_collectors/compact_on_deletion_collector.cc", @@ -5574,6 +5575,12 @@ cpp_unittest_wrapper(name="work_queue_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="write_amp_based_rate_limiter_test", + srcs=["utilities/rate_limiters/write_amp_based_rate_limiter_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="write_batch_test", srcs=["db/write_batch_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/c.cc b/db/c.cc index 5555ae19875..92095dbc700 100644 --- a/db/c.cc +++ b/db/c.cc @@ -47,6 +47,7 @@ #include "rocksdb/write_batch.h" #include "rocksdb/write_buffer_manager.h" #include "utilities/merge_operators.h" 
+#include "utilities/rate_limiters/write_amp_based_rate_limiter.h" using ROCKSDB_NAMESPACE::BackupEngine; using ROCKSDB_NAMESPACE::BackupEngineOptions; @@ -95,6 +96,7 @@ using ROCKSDB_NAMESPACE::NewCompactOnDeletionCollectorFactory; using ROCKSDB_NAMESPACE::NewGenericRateLimiter; using ROCKSDB_NAMESPACE::NewLRUCache; using ROCKSDB_NAMESPACE::NewRibbonFilterPolicy; +using ROCKSDB_NAMESPACE::NewWriteAmpBasedRateLimiter; using ROCKSDB_NAMESPACE::OptimisticTransactionDB; using ROCKSDB_NAMESPACE::OptimisticTransactionOptions; using ROCKSDB_NAMESPACE::Options; @@ -3997,6 +3999,14 @@ rocksdb_ratelimiter_t* rocksdb_ratelimiter_create_auto_tuned( return rate_limiter; } +rocksdb_ratelimiter_t* rocksdb_writeampbasedratelimiter_create( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness) { + rocksdb_ratelimiter_t* rate_limiter = new rocksdb_ratelimiter_t; + rate_limiter->rep.reset(NewWriteAmpBasedRateLimiter( + rate_bytes_per_sec, refill_period_us, fairness)); + return rate_limiter; +} + void rocksdb_ratelimiter_destroy(rocksdb_ratelimiter_t* limiter) { delete limiter; } diff --git a/db/column_family.cc b/db/column_family.cc index 9782cd31a70..9b5e6590c77 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -622,7 +622,8 @@ ColumnFamilyData::ColumnFamilyData( } } - RecalculateWriteStallConditions(mutable_cf_options_); + RecalculateWriteStallConditions(mutable_cf_options_, + ioptions_.rate_limiter.get()); if (cf_options.table_factory->IsInstanceOf( TableFactory::kBlockBasedTableName()) && @@ -935,7 +936,7 @@ ColumnFamilyData::GetWriteStallConditionAndCause( } WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( - const MutableCFOptions& mutable_cf_options) { + const MutableCFOptions& mutable_cf_options, RateLimiter* rate_limiter) { auto write_stall_condition = WriteStallCondition::kNormal; if (current_ != nullptr) { auto* vstorage = current_->storage_info(); @@ -1064,6 +1065,9 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( // compaction. write_controller_token_ = write_controller->GetCompactionPressureToken(); + if (rate_limiter) { + rate_limiter->PaceUp(false /*critical*/); + } } else if (vstorage->estimated_compaction_needed_bytes() >= GetPendingCompactionBytesForCompactionSpeedup( mutable_cf_options, vstorage)) { @@ -1093,6 +1097,16 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( 4); } } + if (rate_limiter) { + // pace up limiter when close to write stall + if (write_stall_condition != WriteStallCondition::kNormal || + vstorage->l0_delay_trigger_count() >= + 0.8 * mutable_cf_options.level0_slowdown_writes_trigger || + vstorage->estimated_compaction_needed_bytes() >= + 0.5 * mutable_cf_options.soft_pending_compaction_bytes_limit) { + rate_limiter->PaceUp(true /*critical*/); + } + } prev_compaction_needed_bytes_ = compaction_needed_bytes; } return write_stall_condition; @@ -1320,8 +1334,8 @@ void ColumnFamilyData::InstallSuperVersion( // Should not recalculate slow down condition if nothing has changed, since // currently RecalculateWriteStallConditions() treats it as further slowing // down is needed. 
- super_version_->write_stall_condition = - RecalculateWriteStallConditions(mutable_cf_options); + super_version_->write_stall_condition = RecalculateWriteStallConditions( + mutable_cf_options, ioptions_.rate_limiter.get()); } else { super_version_->write_stall_condition = old_superversion->write_stall_condition; diff --git a/db/column_family.h b/db/column_family.h index 2a38feb7310..067b419dbba 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -478,7 +478,8 @@ class ColumnFamilyData { // Recalculate some stall conditions, which are changed only during // compaction, adding new memtable and/or recalculation of compaction score. WriteStallCondition RecalculateWriteStallConditions( - const MutableCFOptions& mutable_cf_options); + const MutableCFOptions& mutable_cf_options, + RateLimiter* rate_limiter = nullptr); void set_initialized() { initialized_.store(true); } diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 8a26585fe73..f98406dddec 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1681,6 +1681,10 @@ extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* rocksdb_ratelimiter_create_auto_tuned(int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness); +extern ROCKSDB_LIBRARY_API rocksdb_ratelimiter_t* +rocksdb_writeampbasedratelimiter_create(int64_t rate_bytes_per_sec, + int64_t refill_period_us, + int32_t fairness); extern ROCKSDB_LIBRARY_API void rocksdb_ratelimiter_destroy( rocksdb_ratelimiter_t*); diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h index 3515b1e953b..1939843d6da 100644 --- a/include/rocksdb/rate_limiter.h +++ b/include/rocksdb/rate_limiter.h @@ -48,6 +48,8 @@ class RateLimiter { virtual Status SetSingleBurstBytes(int64_t /* single_burst_bytes */) { return Status::NotSupported(); } + // Dynamically change rate limiter's auto_tuned mode. + virtual void SetAutoTuned(bool /*auto_tuned*/) {} // Deprecated. 
New RateLimiter derived classes should override // Request(const int64_t, const Env::IOPriority, Statistics*) or @@ -120,6 +122,8 @@ class RateLimiter { virtual int64_t GetBytesPerSecond() const = 0; + virtual bool GetAutoTuned() const { return false; } + virtual bool IsRateLimited(OpType op_type) { if ((mode_ == RateLimiter::Mode::kWritesOnly && op_type == RateLimiter::OpType::kRead) || @@ -130,6 +134,8 @@ class RateLimiter { return true; } + virtual void PaceUp(bool /*critical*/) {} + protected: Mode GetMode() { return mode_; } @@ -165,4 +171,11 @@ extern RateLimiter* NewGenericRateLimiter( RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly, bool auto_tuned = false); +extern RateLimiter* NewWriteAmpBasedRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us = 100 * 1000, + int32_t fairness = 10, + RateLimiter::Mode mode = RateLimiter::Mode::kWritesOnly, + bool auto_tuned = false, int tune_per_sec = 1, + size_t smooth_window_size = 300, size_t recent_window_size = 30); + } // namespace ROCKSDB_NAMESPACE diff --git a/src.mk b/src.mk index a03a476ff15..11be8b271b3 100644 --- a/src.mk +++ b/src.mk @@ -298,6 +298,7 @@ LIB_SOURCES = \ utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/persistent_cache/persistent_cache_tier.cc \ utilities/persistent_cache/volatile_tier_impl.cc \ + utilities/rate_limiters/write_amp_based_rate_limiter.cc \ utilities/simulator_cache/cache_simulator.cc \ utilities/simulator_cache/sim_cache.cc \ utilities/table_properties_collectors/compact_on_deletion_collector.cc \ @@ -619,6 +620,7 @@ TEST_MAIN_SOURCES = \ utilities/options/options_util_test.cc \ utilities/persistent_cache/hash_table_test.cc \ utilities/persistent_cache/persistent_cache_test.cc \ + utilities/rate_limiters/write_amp_based_rate_limiter_test.cc \ utilities/simulator_cache/cache_simulator_test.cc \ utilities/simulator_cache/sim_cache_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ diff --git a/utilities/rate_limiters/write_amp_based_rate_limiter.cc b/utilities/rate_limiters/write_amp_based_rate_limiter.cc new file mode 100644 index 00000000000..55787d5b87f --- /dev/null +++ b/utilities/rate_limiters/write_amp_based_rate_limiter.cc @@ -0,0 +1,466 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "utilities/rate_limiters/write_amp_based_rate_limiter.h" + +#include "monitoring/statistics_impl.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" +#include "util/aligned_buffer.h" + +namespace ROCKSDB_NAMESPACE { + +// Pending request +struct WriteAmpBasedRateLimiter::Req { + explicit Req(int64_t _bytes, port::Mutex* _mu) + : request_bytes(_bytes), bytes(_bytes), cv(_mu), granted(false) {} + int64_t request_bytes; + int64_t bytes; + port::CondVar cv; + bool granted; +}; + +namespace { +// Due to the execution model of compaction, large waves of pending compactions +// could possibly be hidden behind a constant rate of I/O requests. 
It's then +// wise to raise the threshold slightly above estimation to ensure those +// pending compactions can contribute to the convergence of a new alternative +// threshold. +// Padding is calculated through hyperbola based on empirical percentage of 10% +// and special care for low-pressure domain. E.g. coordinates (5M, 18M) and +// (10M, 16M) are on this curve. +int64_t CalculatePadding(int64_t base) { + return base / 10 + 577464606419583ll / (base + 26225305); +} +} // unnamed namespace + +WriteAmpBasedRateLimiter::WriteAmpBasedRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us, int32_t fairness, + RateLimiter::Mode mode, Env* env, bool auto_tuned, int secs_per_tune, + size_t smooth_window_size, size_t recent_window_size) + : RateLimiter(mode), + refill_period_us_(refill_period_us), + rate_bytes_per_sec_(auto_tuned ? rate_bytes_per_sec / 2 + : rate_bytes_per_sec), + refill_bytes_per_period_( + CalculateRefillBytesPerPeriod(rate_bytes_per_sec_)), + env_(env), + stop_(false), + exit_cv_(&request_mutex_), + requests_to_wait_(0), + available_bytes_(0), + next_refill_us_(NowMicrosMonotonic(env_)), + fairness_(fairness > 100 ? 100 : fairness), + rnd_((uint32_t)time(nullptr)), + leader_(nullptr), + auto_tuned_(auto_tuned), + secs_per_tune_(secs_per_tune == 0 ? 1 : secs_per_tune), + max_bytes_per_sec_(rate_bytes_per_sec), + tuned_time_(NowMicrosMonotonic(env_)), + duration_highpri_bytes_through_(0), + duration_bytes_through_(0), + bytes_sampler_(smooth_window_size, recent_window_size), + highpri_bytes_sampler_(smooth_window_size, recent_window_size), + limit_bytes_sampler_(recent_window_size, recent_window_size), + critical_pace_up_(false), + normal_pace_up_(false), + percent_delta_(0) { + std::fill(total_requests_, total_requests_ + Env::IO_TOTAL, 0); + std::fill(total_bytes_through_, total_bytes_through_ + Env::IO_TOTAL, 0); +} + +WriteAmpBasedRateLimiter::~WriteAmpBasedRateLimiter() { + MutexLock g(&request_mutex_); + stop_ = true; + for (auto i = 0; i < Env::IO_TOTAL; ++i) { + requests_to_wait_ += queue_[i].size(); + for (auto& r : queue_[i]) { + r->cv.Signal(); + } + } + while (requests_to_wait_ > 0) { + exit_cv_.Wait(); + } +} + +void WriteAmpBasedRateLimiter::SetBytesPerSecond(int64_t bytes_per_second) { + assert(bytes_per_second > 0); + if (auto_tuned_.load(std::memory_order_acquire)) { + max_bytes_per_sec_.store(bytes_per_second, std::memory_order_relaxed); + } else { + SetActualBytesPerSecond(bytes_per_second); + } +} + +void WriteAmpBasedRateLimiter::SetAutoTuned(bool auto_tuned) { + MutexLock g(&auto_tuned_mutex_); + if (auto_tuned_.load(std::memory_order_acquire) != auto_tuned) { + if (auto_tuned) { + max_bytes_per_sec_.store(rate_bytes_per_sec_, std::memory_order_relaxed); + refill_bytes_per_period_.store( + CalculateRefillBytesPerPeriod(rate_bytes_per_sec_), + std::memory_order_relaxed); + } else { + // must hold this lock to avoid tuner changing `rate_bytes_per_sec_` + MutexLock g2(&request_mutex_); + rate_bytes_per_sec_ = max_bytes_per_sec_.load(std::memory_order_relaxed); + refill_bytes_per_period_.store( + CalculateRefillBytesPerPeriod(rate_bytes_per_sec_), + std::memory_order_relaxed); + } + auto_tuned_.store(auto_tuned, std::memory_order_release); + } +} + +void WriteAmpBasedRateLimiter::SetActualBytesPerSecond( + int64_t bytes_per_second) { + rate_bytes_per_sec_ = bytes_per_second; + refill_bytes_per_period_.store( + CalculateRefillBytesPerPeriod(bytes_per_second), + std::memory_order_relaxed); +} + +bool 
WriteAmpBasedRateLimiter::IsFrontOfOneQueue(Req* r) {
+  request_mutex_.AssertHeld();
+  for (auto i = 0; i < Env::IO_TOTAL; ++i) {
+    if (!queue_[i].empty() && r == queue_[i].front()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+void WriteAmpBasedRateLimiter::Request(int64_t bytes, const Env::IOPriority pri,
+                                       Statistics* stats) {
+  TEST_SYNC_POINT("WriteAmpBasedRateLimiter::Request");
+  TEST_SYNC_POINT_CALLBACK("WriteAmpBasedRateLimiter::Request:1",
+                           &rate_bytes_per_sec_);
+  if (auto_tuned_.load(std::memory_order_acquire) &&
+      (pri == Env::IO_HIGH || pri == Env::IO_USER) &&
+      duration_highpri_bytes_through_ + duration_bytes_through_ + bytes <=
+          max_bytes_per_sec_.load(std::memory_order_relaxed) * secs_per_tune_) {
+    // In the case where low-priority requests are absent, the actual time
+    // elapsed will be larger than secs_per_tune_, making the limit even
+    // tighter.
+    total_bytes_through_[pri] += bytes;
+    ++total_requests_[pri];
+    duration_highpri_bytes_through_ += bytes;
+    return;
+  }
+  assert(bytes <= refill_bytes_per_period_.load(std::memory_order_relaxed));
+  MutexLock g(&request_mutex_);
+
+  if (auto_tuned_.load(std::memory_order_acquire)) {
+    std::chrono::microseconds now(NowMicrosMonotonic(env_));
+    auto micros_per_tune = 1000 * 1000 * secs_per_tune_;
+    if (now - tuned_time_ >= std::chrono::microseconds(micros_per_tune)) {
+      Tune();
+    }
+  }
+
+  if (stop_) {
+    return;
+  }
+
+  ++total_requests_[pri];
+
+  if (available_bytes_ >= bytes) {
+    // The refill thread assigns quota and notifies requests waiting on
+    // the queue under mutex. So if we get here, that means nobody
+    // is waiting?
+    available_bytes_ -= bytes;
+    total_bytes_through_[pri] += bytes;
+    duration_bytes_through_ += bytes;
+    return;
+  }
+
+  // Request cannot be satisfied at this moment, enqueue
+  Req r(bytes, &request_mutex_);
+  queue_[pri].push_back(&r);
+
+  do {
+    bool timedout = false;
+    // Leader election, candidates can be:
+    // (1) a new incoming request,
+    // (2) a previous leader, whose quota has not been assigned yet due to
+    //     its lower priority
+    // (3) a previous waiter at the front of a queue, who got notified by
+    //     the previous leader
+    if (leader_ == nullptr && IsFrontOfOneQueue(&r)) {
+      leader_ = &r;
+      int64_t delta = next_refill_us_ - NowMicrosMonotonic(env_);
+      delta = delta > 0 ? delta : 0;
+      if (delta == 0) {
+        timedout = true;
+      } else {
+        int64_t wait_until = env_->NowMicros() + delta;
+        RecordTick(stats, NUMBER_RATE_LIMITER_DRAINS);
+        timedout = r.cv.TimedWait(wait_until);
+      }
+    } else {
+      // Not at the front of the queue, or a leader has already been elected
+      r.cv.Wait();
+    }
+
+    // request_mutex_ is held from now on
+    if (stop_) {
+      --requests_to_wait_;
+      exit_cv_.Signal();
+      return;
+    }
+
+    // Make sure the woken-up request is always at the front of its queue
+    assert(r.granted || IsFrontOfOneQueue(&r));
+    assert(leader_ == nullptr || IsFrontOfOneQueue(leader_));
+
+    if (leader_ == &r) {
+      // Woken up from TimedWait()
+      if (timedout) {
+        // Time to do refill!
+        Refill();
+
+        // Re-elect a new leader regardless. This is to simplify the
+        // election handling.
+        leader_ = nullptr;
+
+        // Notify the front of the queue if the current leader is going away
+        if (r.granted) {
+          // The current leader has already been granted its quota. Notify
+          // the front of the waiting queue so it can participate in the next
+          // round of election.
+          for (auto i = 0; i < Env::IO_TOTAL; ++i) {
+            if (!queue_[i].empty()) {
+              assert(queue_[i].front() != &r);
+              queue_[i].front()->cv.Signal();
+            }
+          }
+          // Done
+          break;
+        }
+      } else {
+        // Spontaneous wake up, need to continue to wait
+        assert(!r.granted);
+        leader_ = nullptr;
+      }
+    } else {
+      // Woken up by the previous leader:
+      // (1) if the requested quota is granted, it is done.
+      // (2) if the requested quota is not granted, this means the current
+      // thread was picked as a new leader candidate (the previous leader got
+      // its quota). It needs to participate in the leader election because a
+      // new request may come in before this thread gets woken up. So it may
+      // actually need to do Wait() again.
+      assert(!timedout);
+    }
+  } while (!r.granted);
+}
+
+std::vector<Env::IOPriority>
+WriteAmpBasedRateLimiter::GeneratePriorityIterationOrderLocked() {
+  std::vector<Env::IOPriority> pri_iteration_order(Env::IO_TOTAL /* 4 */);
+  // We make Env::IO_USER a superior priority by always iterating its queue
+  // first
+  pri_iteration_order[0] = Env::IO_USER;
+
+  bool high_pri_iterated_after_mid_low_pri = rnd_.OneIn(fairness_);
+  TEST_SYNC_POINT_CALLBACK(
+      "WriteAmpBasedRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PostRandomOneInFairnessForHighPri",
+      &high_pri_iterated_after_mid_low_pri);
+  bool mid_pri_itereated_after_low_pri = rnd_.OneIn(fairness_);
+  TEST_SYNC_POINT_CALLBACK(
+      "WriteAmpBasedRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PostRandomOneInFairnessForMidPri",
+      &mid_pri_itereated_after_low_pri);
+
+  if (high_pri_iterated_after_mid_low_pri) {
+    pri_iteration_order[3] = Env::IO_HIGH;
+    pri_iteration_order[2] =
+        mid_pri_itereated_after_low_pri ? Env::IO_MID : Env::IO_LOW;
+    pri_iteration_order[1] =
+        (pri_iteration_order[2] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID;
+  } else {
+    pri_iteration_order[1] = Env::IO_HIGH;
+    pri_iteration_order[3] =
+        mid_pri_itereated_after_low_pri ? Env::IO_MID : Env::IO_LOW;
+    pri_iteration_order[2] =
+        (pri_iteration_order[3] == Env::IO_MID) ? Env::IO_LOW : Env::IO_MID;
+  }
+  TEST_SYNC_POINT_CALLBACK(
+      "WriteAmpBasedRateLimiter::GeneratePriorityIterationOrderLocked::"
+      "PreReturnPriIterationOrder",
+      &pri_iteration_order);
+  return pri_iteration_order;
+}
+
+void WriteAmpBasedRateLimiter::Refill() {
+  TEST_SYNC_POINT("WriteAmpBasedRateLimiter::Refill");
+  next_refill_us_ = NowMicrosMonotonic(env_) + refill_period_us_;
+  // Reset the available quota for this period (unused quota from the last
+  // period is not carried over).
+  auto refill_bytes_per_period =
+      refill_bytes_per_period_.load(std::memory_order_relaxed);
+  available_bytes_ = refill_bytes_per_period;
+
+  auto order = GeneratePriorityIterationOrderLocked();
+  for (auto pri : order) {
+    auto* queue = &queue_[pri];
+    while (!queue->empty()) {
+      auto* next_req = queue->front();
+      if (available_bytes_ < next_req->request_bytes) {
+        // avoid starvation
+        next_req->request_bytes -= available_bytes_;
+        available_bytes_ = 0;
+        break;
+      }
+      available_bytes_ -= next_req->request_bytes;
+      next_req->request_bytes = 0;
+      total_bytes_through_[pri] += next_req->bytes;
+      duration_bytes_through_ += next_req->bytes;
+      queue->pop_front();
+
+      next_req->granted = true;
+      if (next_req != leader_) {
+        // Quota granted, signal the thread
+        next_req->cv.Signal();
+      }
+    }
+  }
+}
+
+int64_t WriteAmpBasedRateLimiter::CalculateRefillBytesPerPeriod(
+    int64_t rate_bytes_per_sec) {
+  if (std::numeric_limits<int64_t>::max() / rate_bytes_per_sec <
+      refill_period_us_) {
+    // Avoid unexpected result in the overflow case. The result now is still
+    // inaccurate but is a number that is large enough.
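+    // (For reference, the clamped value below works out to roughly 9.2e12
+    // bytes per refill period, i.e. effectively unlimited.)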
+    return std::numeric_limits<int64_t>::max() / 1000000;
+  } else {
+    return std::max(kMinRefillBytesPerPeriod,
+                    rate_bytes_per_sec * refill_period_us_ / 1000000);
+  }
+}
+
+// The core function used to dynamically adjust the compaction rate limit,
+// called **at most** once every `secs_per_tune`.
+// The I/O throughput threshold is automatically tuned based on historical
+// samples of compaction and flush flow. The algorithm takes into account the
+// limiter's inability to estimate the pressure of pending compactions, and
+// the possibility of foreground write fluctuation.
+Status WriteAmpBasedRateLimiter::Tune() {
+  // the computed rate limit will be no less than 10MB/s
+  const int64_t kMinBytesPerSec = 10 << 20;
+  // high-priority bytes are padded up to at least 8MB
+  const int64_t kHighBytesLower = 8 << 20;
+  // lower bound for the write amplification estimation
+  const int kRatioLower = 10;
+  const int kPercentDeltaMax = 6;
+
+  std::chrono::microseconds prev_tuned_time = tuned_time_;
+  tuned_time_ = std::chrono::microseconds(NowMicrosMonotonic(env_));
+  auto duration = tuned_time_ - prev_tuned_time;
+  auto duration_ms =
+      std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+
+  int64_t prev_bytes_per_sec = GetBytesPerSecond();
+
+  // This function can be called less frequently than we anticipate when the
+  // compaction rate is low. Loop through the actual time slice to correct
+  // the estimation.
+  auto millis_per_tune = 1000 * secs_per_tune_;
+  for (uint32_t i = 0; i < duration_ms / millis_per_tune; i++) {
+    bytes_sampler_.AddSample(duration_bytes_through_ * 1000 / duration_ms);
+    highpri_bytes_sampler_.AddSample(duration_highpri_bytes_through_ * 1000 /
+                                     duration_ms);
+    limit_bytes_sampler_.AddSample(prev_bytes_per_sec);
+  }
+  int64_t new_bytes_per_sec = bytes_sampler_.GetFullValue();
+  int32_t ratio = std::max(
+      kRatioLower,
+      static_cast<int32_t>(
+          bytes_sampler_.GetFullValue() * 10 /
+          std::max(highpri_bytes_sampler_.GetFullValue(), kHighBytesLower)));
+  // Only adjust the threshold when foreground write (flush) flow increases,
+  // because a decrease could also be caused by manual flow control at the
+  // application level to alleviate background pressure.
+  new_bytes_per_sec = std::max(
+      new_bytes_per_sec,
+      ratio *
+          std::max(highpri_bytes_sampler_.GetRecentValue(), kHighBytesLower) /
+          10);
+  // Set the threshold higher to avoid write stalls caused by pending
+  // compactions.
+  int64_t padding = CalculatePadding(new_bytes_per_sec);
+  // Adjustment based on utilization.
+  int64_t util = bytes_sampler_.GetRecentValue() * 1000 /
+                 limit_bytes_sampler_.GetRecentValue();
+  if (util >= 995) {
+    if (percent_delta_ < kPercentDeltaMax) {
+      percent_delta_ += 1;
+    }
+  } else if (percent_delta_ > 0) {
+    percent_delta_ -= 1;
+  }
+  // React to pace-up requests when the LSM is out of shape.
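+  // A critical pace-up resets percent_delta_ to a one-off 150, so the next
+  // limit is roughly 2.5x the estimated throughput (plus padding); a normal
+  // pace-up only lifts percent_delta_ enough to add about 1.5x the padding.
+  // Once utilization drops below ~99.5%, percent_delta_ decays by one point
+  // per tune.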
+ if (critical_pace_up_.load(std::memory_order_relaxed)) { + percent_delta_ = 150; + critical_pace_up_.store(false, std::memory_order_relaxed); + } else if (normal_pace_up_.load(std::memory_order_relaxed)) { + percent_delta_ = + std::max(percent_delta_, + static_cast(padding * 150 / new_bytes_per_sec)); + normal_pace_up_.store(false, std::memory_order_relaxed); + } + new_bytes_per_sec += padding + new_bytes_per_sec * percent_delta_ / 100; + new_bytes_per_sec = + std::max(kMinBytesPerSec, + std::min(new_bytes_per_sec, + max_bytes_per_sec_.load(std::memory_order_relaxed) - + highpri_bytes_sampler_.GetRecentValue())); + if (new_bytes_per_sec != prev_bytes_per_sec) { + SetActualBytesPerSecond(new_bytes_per_sec); + } + + duration_bytes_through_ = 0; + duration_highpri_bytes_through_ = 0; + return Status::OK(); +} + +void WriteAmpBasedRateLimiter::PaceUp(bool critical) { + if (auto_tuned_.load(std::memory_order_acquire)) { + if (critical) { + critical_pace_up_.store(true, std::memory_order_relaxed); + } else { + normal_pace_up_.store(true, std::memory_order_relaxed); + } + } +} + +RateLimiter* NewWriteAmpBasedRateLimiter( + int64_t rate_bytes_per_sec, int64_t refill_period_us /* = 100 * 1000 */, + int32_t fairness /* = 10 */, + RateLimiter::Mode mode /* = RateLimiter::Mode::kWritesOnly */, + bool auto_tuned /* = false */, int tune_per_sec /* = 1 */, + size_t smooth_window_size /* = 300 */, + size_t recent_window_size /* = 30 */) { + assert(rate_bytes_per_sec > 0); + assert(refill_period_us > 0); + assert(fairness > 0); + assert(tune_per_sec >= 0); + assert(smooth_window_size >= recent_window_size); + if (smooth_window_size == 0) { + smooth_window_size = 300; + } + if (recent_window_size == 0) { + recent_window_size = 30; + } + return new WriteAmpBasedRateLimiter( + rate_bytes_per_sec, refill_period_us, fairness, mode, Env::Default(), + auto_tuned, tune_per_sec, smooth_window_size, recent_window_size); +} + +} // namespace ROCKSDB_NAMESPACE diff --git a/utilities/rate_limiters/write_amp_based_rate_limiter.h b/utilities/rate_limiters/write_amp_based_rate_limiter.h new file mode 100644 index 00000000000..7d1b8f2ea09 --- /dev/null +++ b/utilities/rate_limiters/write_amp_based_rate_limiter.h @@ -0,0 +1,172 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include +#include +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "util/mutexlock.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class WriteAmpBasedRateLimiter : public RateLimiter { + public: + WriteAmpBasedRateLimiter(int64_t refill_bytes, int64_t refill_period_us, + int32_t fairness, RateLimiter::Mode mode, Env* env, + bool auto_tuned, int secs_per_tune, + size_t auto_tune_smooth_window, + size_t auto_tune_recent_window); + + virtual ~WriteAmpBasedRateLimiter(); + + // This API allows user to dynamically change rate limiter's bytes per second. + // When auto-tuned is on, this sets rate limit's upper bound instead. 
+ virtual void SetBytesPerSecond(int64_t bytes_per_second) override; + + // Dynamically change rate limiter's auto_tuned mode. + virtual void SetAutoTuned(bool auto_tuned) override; + + // Request for token to write bytes. If this request can not be satisfied, + // the call is blocked. Caller is responsible to make sure + // bytes <= GetSingleBurstBytes() + using RateLimiter::Request; + virtual void Request(const int64_t bytes, const Env::IOPriority pri, + Statistics* stats) override; + + virtual int64_t GetSingleBurstBytes() const override { + return refill_bytes_per_period_.load(std::memory_order_relaxed); + } + + virtual int64_t GetTotalBytesThrough( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + MutexLock g(&request_mutex_); + if (pri == Env::IO_TOTAL) { + return total_bytes_through_[Env::IO_LOW] + + total_bytes_through_[Env::IO_MID] + + total_bytes_through_[Env::IO_HIGH] + + total_bytes_through_[Env::IO_USER]; + } + return total_bytes_through_[pri]; + } + + virtual int64_t GetTotalRequests( + const Env::IOPriority pri = Env::IO_TOTAL) const override { + MutexLock g(&request_mutex_); + if (pri == Env::IO_TOTAL) { + return total_requests_[Env::IO_LOW] + total_requests_[Env::IO_MID] + + total_requests_[Env::IO_HIGH] + total_requests_[Env::IO_USER]; + } + return total_requests_[pri]; + } + + virtual int64_t GetBytesPerSecond() const override { + return rate_bytes_per_sec_; + } + + virtual bool GetAutoTuned() const override { + return auto_tuned_.load(std::memory_order_acquire); + } + + virtual void PaceUp(bool critical) override; + + private: + struct Req; + bool IsFrontOfOneQueue(Req* req); + void Refill(); + int64_t CalculateRefillBytesPerPeriod(int64_t rate_bytes_per_sec); + void SetActualBytesPerSecond(int64_t bytes_per_second); + Status Tune(); + std::vector GeneratePriorityIterationOrderLocked(); + + uint64_t NowMicrosMonotonic(Env* env) { + return env->NowNanos() / std::milli::den; + } + + // This mutex guard all internal states + mutable port::Mutex request_mutex_; + + const int64_t kMinRefillBytesPerPeriod = 100; + + const int64_t refill_period_us_; + + int64_t rate_bytes_per_sec_; + // This variable can be changed dynamically. 
+ std::atomic refill_bytes_per_period_; + Env* const env_; + + bool stop_; + port::CondVar exit_cv_; + int32_t requests_to_wait_; + + int64_t total_requests_[Env::IO_TOTAL]; + int64_t total_bytes_through_[Env::IO_TOTAL]; + int64_t available_bytes_; + int64_t next_refill_us_; + + int32_t fairness_; + Random rnd_; + + struct Req; + Req* leader_; + std::deque queue_[Env::IO_TOTAL]; + + // only used to synchronize auto_tuned setters + port::Mutex auto_tuned_mutex_; + + std::atomic auto_tuned_; + int secs_per_tune_; + std::atomic max_bytes_per_sec_; + std::chrono::microseconds tuned_time_; + int64_t duration_highpri_bytes_through_; + int64_t duration_bytes_through_; + + class WindowSmoother { + public: + WindowSmoother(size_t smooth_window_size, size_t recent_window_size) + : smooth_window_size_(smooth_window_size), + recent_window_size_(recent_window_size), + data_(smooth_window_size, 0) {} + void AddSample(int64_t v) { + auto recent_cursor = + (cursor_ + 1 + smooth_window_size_ - recent_window_size_) % + smooth_window_size_; + cursor_ = (cursor_ + 1) % smooth_window_size_; + full_sum_ += v - data_[cursor_]; + recent_sum_ += v - data_[recent_cursor]; + data_[cursor_] = v; + } + int64_t GetFullValue() { return full_sum_ / smooth_window_size_; } + int64_t GetRecentValue() { return recent_sum_ / recent_window_size_; } + bool AtTimePoint() const { return cursor_ == 0; } + + private: + uint32_t cursor_{0}; // point to the most recent sample + size_t smooth_window_size_; + size_t recent_window_size_; + std::vector data_; + int64_t full_sum_{0}; + int64_t recent_sum_{0}; + }; + + WindowSmoother bytes_sampler_; + WindowSmoother highpri_bytes_sampler_; + WindowSmoother limit_bytes_sampler_; + std::atomic critical_pace_up_; + std::atomic normal_pace_up_; + uint32_t percent_delta_; +}; + +} // namespace ROCKSDB_NAMESPACE diff --git a/utilities/rate_limiters/write_amp_based_rate_limiter_test.cc b/utilities/rate_limiters/write_amp_based_rate_limiter_test.cc new file mode 100644 index 00000000000..1c29db69e6c --- /dev/null +++ b/utilities/rate_limiters/write_amp_based_rate_limiter_test.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "utilities/rate_limiters/write_amp_based_rate_limiter.h" + +#include +#include +#include + +#include "db/db_test_util.h" +#include "rocksdb/env.h" +#include "rocksdb/rate_limiter.h" +#include "test_util/sync_point.h" +#include "test_util/testharness.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +// TODO(yhchiang): the rate will not be accurate when we run test in parallel. 
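+// (This suite is built as the write_amp_based_rate_limiter_test target via
+// the Makefile and TARGETS entries added in this patch.)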
+class WriteAmpBasedRateLimiterTest : public testing::Test {}; + +TEST_F(WriteAmpBasedRateLimiterTest, OverflowRate) { + WriteAmpBasedRateLimiter limiter(std::numeric_limits::max(), 1000, + 10, RateLimiter::Mode::kWritesOnly, + Env::Default(), false /* auto_tuned */, 1, + 100, 10); + ASSERT_GT(limiter.GetSingleBurstBytes(), 1000000000ll); +} + +TEST_F(WriteAmpBasedRateLimiterTest, StartStop) { + std::unique_ptr limiter( + NewWriteAmpBasedRateLimiter(100, 100, 10)); +} + +TEST_F(WriteAmpBasedRateLimiterTest, Modes) { + for (auto mode : {RateLimiter::Mode::kWritesOnly, + RateLimiter::Mode::kReadsOnly, RateLimiter::Mode::kAllIo}) { + WriteAmpBasedRateLimiter limiter( + 2000 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, mode, Env::Default(), false /* auto_tuned */, + 1 /* secs_per_tune */, 100 /* smooth_window */, 10 /* recent_window */); + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kRead); + if (mode == RateLimiter::Mode::kWritesOnly) { + ASSERT_EQ(0, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + + limiter.Request(1000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + if (mode == RateLimiter::Mode::kAllIo) { + ASSERT_EQ(2000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } else { + ASSERT_EQ(1000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + } + } +} + +TEST_F(WriteAmpBasedRateLimiterTest, AutoTune) { + auto* thread_env = Env::Default(); + WriteAmpBasedRateLimiter limiter( + 10000 /* rate_bytes_per_sec */, 1000 * 1000 /* refill_period_us */, + 10 /* fairness */, RateLimiter::Mode::kAllIo, Env::Default(), + true /* auto_tuned */, 1 /* secs_per_tune */, 100 /* smooth_window */, + 10 /* recent_window */); + limiter.Request(8000 /* bytes */, Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + ASSERT_EQ(8000, limiter.GetTotalBytesThrough(Env::IO_HIGH)); + + thread_env->SleepForMicroseconds(1000 * 1000); + // request from low io can trigger auto tune. 
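+  // The low-priority request below triggers Tune(); with almost no
+  // compaction flow sampled yet, the tuned rate is clamped to the limiter's
+  // built-in 10 MB/s floor (kMinBytesPerSec), hence the 10485760 expectation.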
+ limiter.Request(1000 /* bytes */, Env::IO_LOW, nullptr /* stats */, + RateLimiter::OpType::kWrite); + ASSERT_EQ(10485760, limiter.GetBytesPerSecond()); + // TODO: add more logic for auto-tune +} + +TEST_F(WriteAmpBasedRateLimiterTest, Rate) { + auto* env = Env::Default(); + struct Arg { + Arg(int32_t _target_rate, int _burst) + : limiter(NewWriteAmpBasedRateLimiter(_target_rate, 100 * 1000, 10)), + request_size(_target_rate / 10), + burst(_burst) {} + std::unique_ptr limiter; + int32_t request_size; + int burst; + }; + + auto writer = [](void* p) { + auto* thread_env = Env::Default(); + auto* arg = static_cast(p); + // Test for 2 seconds + auto until = thread_env->NowMicros() + 2 * 1000000; + Random r((uint32_t)(thread_env->NowNanos() % + std::numeric_limits::max())); + while (thread_env->NowMicros() < until) { + for (int i = 0; i < static_cast(r.Skewed(arg->burst) + 1); ++i) { + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, + Env::IO_HIGH, nullptr /* stats */, + RateLimiter::OpType::kWrite); + } + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW, + nullptr /* stats */, RateLimiter::OpType::kWrite); + } + }; + + for (int i = 1; i <= 16; i *= 2) { + int32_t target = i * 1024 * 10; + Arg arg(target, i / 4 + 1); + int64_t old_total_bytes_through = 0; + for (int iter = 1; iter <= 2; ++iter) { + // second iteration changes the target dynamically + if (iter == 2) { + target *= 2; + arg.limiter->SetBytesPerSecond(target); + } + auto start = env->NowMicros(); + for (int t = 0; t < i; ++t) { + env->StartThread(writer, &arg); + } + env->WaitForJoin(); + + auto elapsed = env->NowMicros() - start; + double rate = + (arg.limiter->GetTotalBytesThrough() - old_total_bytes_through) * + 1000000.0 / elapsed; + old_total_bytes_through = arg.limiter->GetTotalBytesThrough(); + fprintf(stderr, + "request size [1 - %" PRIi32 "], limit %" PRIi32 + " KB/sec, actual rate: %lf KB/sec, elapsed %.2lf seconds\n", + arg.request_size - 1, target / 1024, rate / 1024, + elapsed / 1000000.0); + + ASSERT_GE(rate / target, 0.75); + ASSERT_LE(rate / target, 1.25); + } + } +} + +TEST_F(WriteAmpBasedRateLimiterTest, LimitChangeTest) { + // starvation test when limit changes to a smaller value + int64_t refill_period = 1000 * 1000; + auto* env = Env::Default(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + struct Arg { + Arg(int32_t _request_size, Env::IOPriority _pri, + std::shared_ptr _limiter) + : request_size(_request_size), pri(_pri), limiter(_limiter) {} + int32_t request_size; + Env::IOPriority pri; + std::shared_ptr limiter; + }; + + auto writer = [](void* p) { + auto* arg = static_cast(p); + arg->limiter->Request(arg->request_size, arg->pri, nullptr /* stats */, + RateLimiter::OpType::kWrite); + }; + + for (uint32_t i = 1; i <= 16; i <<= 1) { + int32_t target = i * 1024 * 10; + // refill per second + for (int iter = 0; iter < 2; iter++) { + std::shared_ptr limiter = + std::make_shared( + target, refill_period, 10, RateLimiter::Mode::kWritesOnly, + Env::Default(), false /* auto_tuned */, 1, 300, 30); + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"WriteAmpBasedRateLimiter::Request", + "WriteAmpBasedRateLimiterTest::LimitChangeTest:changeLimitStart"}, + {"WriteAmpBasedRateLimiterTest::LimitChangeTest:changeLimitEnd", + "WriteAmpBasedRateLimiter::Refill"}}); + Arg arg(target, Env::IO_HIGH, limiter); + // The idea behind is to start a request first, then before it refills, + // update limit to a different value (2X/0.5X). 
No starvation should + // be guaranteed under any situation + // TODO(lightmark): more test cases are welcome. + env->StartThread(writer, &arg); + int32_t new_limit = (target << 1) >> (iter << 1); + TEST_SYNC_POINT( + "WriteAmpBasedRateLimiterTest::LimitChangeTest:changeLimitStart"); + arg.limiter->SetBytesPerSecond(new_limit); + TEST_SYNC_POINT( + "WriteAmpBasedRateLimiterTest::LimitChangeTest:changeLimitEnd"); + env->WaitForJoin(); + fprintf(stderr, + "[COMPLETE] request size %" PRIi32 " KB, new limit %" PRIi32 + "KB/sec, refill period %" PRIi64 " ms\n", + target / 1024, new_limit / 1024, refill_period / 1000); + } + } +} + +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}
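For reviewers who want to try the new limiter, here is a minimal usage sketch. It is not part of the patch; it only relies on the NewWriteAmpBasedRateLimiter() declaration this patch adds to include/rocksdb/rate_limiter.h, and the DB path is a placeholder.

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"
  #include "rocksdb/rate_limiter.h"

  int main() {
    rocksdb::Options options;
    options.create_if_missing = true;
    // 512 MiB/s acts as the upper bound; with auto_tuned = true the limiter
    // adjusts the actual limit from the observed compaction and flush flow,
    // starting at half the bound (see the constructor).
    options.rate_limiter.reset(rocksdb::NewWriteAmpBasedRateLimiter(
        512 << 20 /* rate_bytes_per_sec */, 100 * 1000 /* refill_period_us */,
        10 /* fairness */, rocksdb::RateLimiter::Mode::kWritesOnly,
        true /* auto_tuned */));
    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/write_amp_demo", &db);
    // ... normal reads/writes; flush and compaction I/O is now rate limited ...
    delete db;
    return s.ok() ? 0 : 1;
  }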