From bdb8dda732a5b2d044287f85db0571442d0a5775 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 26 Mar 2019 20:56:26 +0100
Subject: [PATCH 1/2] Automatize the reuse of the CUDA stream of an input
 product

I believe this approach is the best we can do in a simple (and local)
way.
---
 .../Common/interface/CUDAProductBase.h        | 34 ++++++++++++++++++-
 HeterogeneousCore/CUDACore/README.md          | 28 +++++++++------
 .../CUDACore/interface/CUDAScopedContext.h    | 10 ++----
 .../CUDACore/src/CUDAScopedContext.cc         | 14 ++++++++
 HeterogeneousCore/CUDACore/test/BuildFile.xml |  5 +++
 .../CUDACore/test/test_CUDAScopedContext.cc   | 11 ++++--
 HeterogeneousCore/CUDACore/test/test_main.cc  | 27 +++++++++++++++
 7 files changed, 106 insertions(+), 23 deletions(-)
diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h
index f54b1c0548ef4..797a364f77516 100644
--- a/CUDADataFormats/Common/interface/CUDAProductBase.h
+++ b/CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -1,6 +1,7 @@
 #ifndef CUDADataFormats_Common_CUDAProductBase_h
 #define CUDADataFormats_Common_CUDAProductBase_h
 
+#include <atomic>
 #include <memory>
 
 #include <cuda/api_wrappers.h>
@@ -13,6 +14,20 @@ class CUDAProductBase {
 public:
   CUDAProductBase() = default; // Needed only for ROOT dictionary generation
 
+  CUDAProductBase(CUDAProductBase&& other):
+    stream_{std::move(other.stream_)},
+    event_{std::move(other.event_)},
+    mayReuseStream_{other.mayReuseStream_.load()},
+    device_{other.device_}
+  {}
+  CUDAProductBase& operator=(CUDAProductBase&& other) {
+    stream_ = std::move(other.stream_);
+    event_ = std::move(other.event_);
+    mayReuseStream_ = other.mayReuseStream_.load();
+    device_ = other.device_;
+    return *this;
+  }
+
   bool isValid() const { return stream_.get() != nullptr; }
   bool isAvailable() const;
 
@@ -20,7 +35,6 @@ class CUDAProductBase {
 
   const cuda::stream_t<>& stream() const { return *stream_; }
   cuda::stream_t<>& stream() { return *stream_; }
-  const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }
 
   const cuda::event_t *event() const { return event_.get(); }
   cuda::event_t *event() { return event_.get(); }
@@ -29,12 +43,30 @@ class CUDAProductBase {
   explicit CUDAProductBase(int device, std::shared_ptr<cuda::stream_t<>> stream, std::shared_ptr<cuda::event_t> event);
 
 private:
+  friend class CUDAScopedContext;
+
+  // Intended to be used only from CUDAScopedContext
+  const std::shared_ptr<cuda::stream_t<>>& streamPtr() const { return stream_; }
+
+  bool mayReuseStream() const {
+    bool expected = true;
+    bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
+    // If the current thread is the one flipping the flag, it may
+    // reuse the stream.
+    return changed;
+  }
+
   // The cuda::stream_t is really shared among edm::Event products, so
   // using shared_ptr also here
   std::shared_ptr<cuda::stream_t<>> stream_; //!
   // shared_ptr because of caching in CUDAService
   std::shared_ptr<cuda::event_t> event_; //!
 
+  // This flag tellswhether the CUDA stream may be reused by a
+  // consumer or not. The goal is to have a "chain" of modules to
+  // queue their work to the same stream.
+  mutable std::atomic<bool> mayReuseStream_ = true; //!
+
   int device_ = -1; //!
 };
 
diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
index 3a31ae5f267ec..4b9f9532ae0bd 100644
--- a/HeterogeneousCore/CUDACore/README.md
+++ b/HeterogeneousCore/CUDACore/README.md
@@ -202,15 +202,9 @@ void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetu
   CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
 
   // Set the current device to the same that was used to produce
-  // InputData, and also use the same CUDA stream
+  // InputData, and possibly use the same CUDA stream
   CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)};
 
-  // Alternatively a new CUDA stream can be created here. This is for
-  // a case where there are two (or more) consumers of
-  // CUDAProduct<InputData> whose work is independent and thus can be run
-  // in parallel.
-  CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder);
-  
   // Grab the real input data. Checks that the input data is on the
   // current device. If the input data was produced in a different CUDA
   // stream than the CUDAScopedContext holds, create an inter-stream
@@ -240,6 +234,12 @@ void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
 }
 ```
 
+See [further below](#setting-the-current-device) for the conditions
+when the `CUDAScopedContext` constructor reuses the CUDA stream. Note
+that the `CUDAScopedContext` constructor taking `edm::StreamID` is
+allowed, it will just always create a new CUDA stream.
+
+
 ### Producer with CUDA input and output (with ExternalWork)
 
 ```cpp
@@ -319,8 +319,8 @@ void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent
   CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
 
   // Set the current device to the same that was used to produce
-  // InputData, and also use the same CUDA stream
-  CUDAScopedContext ctx{streamID};
+  // InputData, and possibly use the same CUDA stream
+  CUDAScopedContext ctx{inputDataWrapped};
 
   // Grab the real input data. Checks that the input data is on the
   // current device. If the input data was produced in a different CUDA
@@ -368,7 +368,7 @@ void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetu
   CUDAProduct<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
 
   // Set the current device to the same that was used to produce
-  // InputData, and also use the same CUDA stream
+  // InputData, and possibly use the same CUDA stream
   CUDAScopedContext ctx{inputDataWrapped};
 
   // Alternatively a new CUDA stream can be created here. This is for
@@ -540,7 +540,13 @@ CUDAScopedContext ctx{cclus};
 `CUDAScopedContext` works in the RAII way and does the following
 * Sets the current device for the current scope
   - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
-  - If constructed from the `CUDAProduct<T>`, uses the same device and CUDA stream as was used to produce the `CUDAProduct<T>`
+  - If constructed from the `CUDAProduct<T>`, uses the same device and possibly the same CUDA stream as was used to produce the `CUDAProduct<T>`
+    * The CUDA stream is reused if this producer is the first consumer
+      of the `CUDAProduct<T>`, otherwise a new CUDA stream is created.
+      This approach is simple compromise to automatically express the work of
+      parallel producers in different CUDA streams, and at the same
+      time allow a chain of producers to queue their work to the same
+      CUDA stream.
 * Gives access to the CUDA stream the algorithm should use to queue asynchronous work
 * Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary
 * [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
index ca6af7ef40d0c..8668e1fca6196 100644
--- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
+++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h
@@ -34,12 +34,7 @@ class CUDAScopedContext {
     stream_(std::move(token.streamPtr()))
   {}
 
-  template<typename T>
-  explicit CUDAScopedContext(const CUDAProduct<T>& data):
-    currentDevice_(data.device()),
-    setDeviceForThisScope_(currentDevice_),
-    stream_(data.streamPtr())
-  {}
+  explicit CUDAScopedContext(const CUDAProductBase& data);
 
   explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
     CUDAScopedContext(streamID)
@@ -47,8 +42,7 @@ class CUDAScopedContext {
     waitingTaskHolder_ = std::move(waitingTaskHolder);
   }
 
-  template <typename T>
-  explicit CUDAScopedContext(const CUDAProduct<T>& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
+  explicit CUDAScopedContext(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder):
     CUDAScopedContext(data)
   {
     waitingTaskHolder_ = std::move(waitingTaskHolder);
diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
index f46ee660b448d..8168954c4b9c8 100644
--- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc
@@ -16,6 +16,20 @@ CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID):
   stream_ = cs->getCUDAStream();
 }
 
+CUDAScopedContext::CUDAScopedContext(const CUDAProductBase& data):
+  currentDevice_(data.device()),
+  setDeviceForThisScope_(currentDevice_)
+{
+  if(data.mayReuseStream()) {
+    stream_ = data.streamPtr();
+  }
+  else {
+    edm::Service<CUDAService> cs;
+    stream_ = cs->getCUDAStream();
+  }
+}
+
+
 CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr<cuda::stream_t<>> stream, std::unique_ptr<cuda::event_t> event):
   currentDevice_(device),
   setDeviceForThisScope_(device),
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
index d4bcf721b4af5..2d1b30a26b93a 100644
--- a/HeterogeneousCore/CUDACore/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -1,6 +1,11 @@
 <bin file="test_*.cc test_*.cu" name="testHeterogeneousCoreCUDACore">
+  <use name="CUDADataFormats/Common"/>
   <use name="FWCore/TestProcessor"/>
+  <use name="FWCore/ParameterSet"/>
+  <use name="FWCore/PluginManager"/>
+  <use name="FWCore/ServiceRegistry"/>
   <use name="HeterogeneousCore/CUDACore"/>
+  <use name="HeterogeneousCore/CUDAServices"/>
   <use name="catch2"/>
   <use name="cuda"/>
 </bin>
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
index 8b67c26490362..809ff69f0fe00 100644
--- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc
@@ -1,6 +1,8 @@
 #include "catch.hpp"
 
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
@@ -59,6 +61,11 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
       CUDAScopedContext ctx2{data};
       REQUIRE(cuda::device::current::get().id() == data.device());
       REQUIRE(ctx2.stream().id() == data.stream().id());
+
+      // Second use of a product should lead to new stream
+      CUDAScopedContext ctx3{data};
+      REQUIRE(cuda::device::current::get().id() == data.device());
+      REQUIRE(ctx3.stream().id() != data.stream().id());
     }
 
     SECTION("Storing state as CUDAContextToken") {
@@ -118,9 +125,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") {
     }
   }
 
-  // Destroy and clean up all resources so that the next test can
-  // assume to start from a clean state.
   cudaCheck(cudaSetDevice(defaultDevice));
   cudaCheck(cudaDeviceSynchronize());
-  cudaDeviceReset();
+  // Note: CUDA resources are cleaned up by CUDAService destructor
 }
diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc
index 0c7c351f437f5..5bde9d9cc6b67 100644
--- a/HeterogeneousCore/CUDACore/test/test_main.cc
+++ b/HeterogeneousCore/CUDACore/test/test_main.cc
@@ -1,2 +1,29 @@
 #define CATCH_CONFIG_MAIN
 #include "catch.hpp"
+
+#include "FWCore/PluginManager/interface/standard.h"
+#include "FWCore/PluginManager/interface/PluginManager.h"
+#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
+
+class ServiceRegistryListener: public Catch::TestEventListenerBase {
+public:
+  using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor
+
+  void testRunStarting(Catch::TestRunInfo const& testRunInfo) override {
+    edmplugin::PluginManager::configure(edmplugin::standard::config());
+
+    const std::string config{
+R"_(import FWCore.ParameterSet.Config as cms
+process = cms.Process('Test')
+process.CUDAService = cms.Service('CUDAService')
+)_"
+  };
+
+    edm::ServiceToken tempToken = edm::ServiceRegistry::createServicesFromConfig(config);
+    operate_.reset(new edm::ServiceRegistry::Operate(tempToken));
+  }
+
+private:
+  std::unique_ptr<edm::ServiceRegistry::Operate> operate_;
+};
+CATCH_REGISTER_LISTENER(ServiceRegistryListener);

From cf491ae586e4deed58d2af93bf24805a9c4b67b6 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Mon, 8 Apr 2019 14:49:27 +0200
Subject: [PATCH 2/2] Fix comment

---
 CUDADataFormats/Common/interface/CUDAProductBase.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h
index 797a364f77516..ef767d8635251 100644
--- a/CUDADataFormats/Common/interface/CUDAProductBase.h
+++ b/CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -62,11 +62,12 @@ class CUDAProductBase {
   // shared_ptr because of caching in CUDAService
   std::shared_ptr<cuda::event_t> event_; //!
 
-  // This flag tellswhether the CUDA stream may be reused by a
+  // This flag tells whether the CUDA stream may be reused by a
   // consumer or not. The goal is to have a "chain" of modules to
   // queue their work to the same stream.
   mutable std::atomic<bool> mayReuseStream_ = true; //!
 
+  // The CUDA device associated with this product
   int device_ = -1; //!
 };