From bdb8dda732a5b2d044287f85db0571442d0a5775 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 26 Mar 2019 20:56:26 +0100 Subject: [PATCH 1/2] Automatize the reuse of the CUDA stream of an input product I believe this approach is the best we can do in a simple (and local) way. --- .../Common/interface/CUDAProductBase.h | 34 ++++++++++++++++++- HeterogeneousCore/CUDACore/README.md | 28 +++++++++------ .../CUDACore/interface/CUDAScopedContext.h | 10 ++---- .../CUDACore/src/CUDAScopedContext.cc | 14 ++++++++ HeterogeneousCore/CUDACore/test/BuildFile.xml | 5 +++ .../CUDACore/test/test_CUDAScopedContext.cc | 11 ++++-- HeterogeneousCore/CUDACore/test/test_main.cc | 27 +++++++++++++++ 7 files changed, 106 insertions(+), 23 deletions(-) diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index f54b1c0548ef4..797a364f77516 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -1,6 +1,7 @@ #ifndef CUDADataFormats_Common_CUDAProductBase_h #define CUDADataFormats_Common_CUDAProductBase_h +#include #include #include @@ -13,6 +14,20 @@ class CUDAProductBase { public: CUDAProductBase() = default; // Needed only for ROOT dictionary generation + CUDAProductBase(CUDAProductBase&& other): + stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} + {} + CUDAProductBase& operator=(CUDAProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + bool isValid() const { return stream_.get() != nullptr; } bool isAvailable() const; @@ -20,7 +35,6 @@ class CUDAProductBase { const cuda::stream_t<>& stream() const { return *stream_; } cuda::stream_t<>& stream() { return *stream_; } - const std::shared_ptr>& streamPtr() const { return stream_; } const cuda::event_t *event() const { return event_.get(); } cuda::event_t *event() { return event_.get(); } @@ -29,12 +43,30 @@ class CUDAProductBase { explicit CUDAProductBase(int device, std::shared_ptr> stream, std::shared_ptr event); private: + friend class CUDAScopedContext; + + // Intended to be used only from CUDAScopedContext + const std::shared_ptr>& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + // The cuda::stream_t is really shared among edm::Event products, so // using shared_ptr also here std::shared_ptr> stream_; //! // shared_ptr because of caching in CUDAService std::shared_ptr event_; //! + // This flag tellswhether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + int device_ = -1; //! }; diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index 3a31ae5f267ec..4b9f9532ae0bd 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -202,15 +202,9 @@ void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetu CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce - // InputData, and also use the same CUDA stream + // InputData, and possibly use the same CUDA stream CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - // Alternatively a new CUDA stream can be created here. This is for - // a case where there are two (or more) consumers of - // CUDAProduct whose work is independent and thus can be run - // in parallel. - CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder); - // Grab the real input data. Checks that the input data is on the // current device. If the input data was produced in a different CUDA // stream than the CUDAScopedContext holds, create an inter-stream @@ -240,6 +234,12 @@ void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { } ``` +See [further below](#setting-the-current-device) for the conditions +when the `CUDAScopedContext` constructor reuses the CUDA stream. Note +that the `CUDAScopedContext` constructor taking `edm::StreamID` is +allowed, it will just always create a new CUDA stream. + + ### Producer with CUDA input and output (with ExternalWork) ```cpp @@ -319,8 +319,8 @@ void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce - // InputData, and also use the same CUDA stream - CUDAScopedContext ctx{streamID}; + // InputData, and possibly use the same CUDA stream + CUDAScopedContext ctx{inputDataWrapped}; // Grab the real input data. Checks that the input data is on the // current device. If the input data was produced in a different CUDA @@ -368,7 +368,7 @@ void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetu CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); // Set the current device to the same that was used to produce - // InputData, and also use the same CUDA stream + // InputData, and possibly use the same CUDA stream CUDAScopedContext ctx{inputDataWrapped}; // Alternatively a new CUDA stream can be created here. This is for @@ -540,7 +540,13 @@ CUDAScopedContext ctx{cclus}; `CUDAScopedContext` works in the RAII way and does the following * Sets the current device for the current scope - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream - - If constructed from the `CUDAProduct`, uses the same device and CUDA stream as was used to produce the `CUDAProduct` + - If constructed from the `CUDAProduct`, uses the same device and possibly the same CUDA stream as was used to produce the `CUDAProduct` + * The CUDA stream is reused if this producer is the first consumer + of the `CUDAProduct`, otherwise a new CUDA stream is created. + This approach is simple compromise to automatically express the work of + parallel producers in different CUDA streams, and at the same + time allow a chain of producers to queue their work to the same + CUDA stream. * Gives access to the CUDA stream the algorithm should use to queue asynchronous work * Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary * [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index ca6af7ef40d0c..8668e1fca6196 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -34,12 +34,7 @@ class CUDAScopedContext { stream_(std::move(token.streamPtr())) {} - template - explicit CUDAScopedContext(const CUDAProduct& data): - currentDevice_(data.device()), - setDeviceForThisScope_(currentDevice_), - stream_(data.streamPtr()) - {} + explicit CUDAScopedContext(const CUDAProductBase& data); explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder): CUDAScopedContext(streamID) @@ -47,8 +42,7 @@ class CUDAScopedContext { waitingTaskHolder_ = std::move(waitingTaskHolder); } - template - explicit CUDAScopedContext(const CUDAProduct& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder): + explicit CUDAScopedContext(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder): CUDAScopedContext(data) { waitingTaskHolder_ = std::move(waitingTaskHolder); diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index f46ee660b448d..8168954c4b9c8 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -16,6 +16,20 @@ CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID): stream_ = cs->getCUDAStream(); } +CUDAScopedContext::CUDAScopedContext(const CUDAProductBase& data): + currentDevice_(data.device()), + setDeviceForThisScope_(currentDevice_) +{ + if(data.mayReuseStream()) { + stream_ = data.streamPtr(); + } + else { + edm::Service cs; + stream_ = cs->getCUDAStream(); + } +} + + CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr> stream, std::unique_ptr event): currentDevice_(device), setDeviceForThisScope_(device), diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml index d4bcf721b4af5..2d1b30a26b93a 100644 --- a/HeterogeneousCore/CUDACore/test/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -1,6 +1,11 @@ + + + + + diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 8b67c26490362..809ff69f0fe00 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -1,6 +1,8 @@ #include "catch.hpp" #include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" @@ -59,6 +61,11 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { CUDAScopedContext ctx2{data}; REQUIRE(cuda::device::current::get().id() == data.device()); REQUIRE(ctx2.stream().id() == data.stream().id()); + + // Second use of a product should lead to new stream + CUDAScopedContext ctx3{data}; + REQUIRE(cuda::device::current::get().id() == data.device()); + REQUIRE(ctx3.stream().id() != data.stream().id()); } SECTION("Storing state as CUDAContextToken") { @@ -118,9 +125,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { } } - // Destroy and clean up all resources so that the next test can - // assume to start from a clean state. cudaCheck(cudaSetDevice(defaultDevice)); cudaCheck(cudaDeviceSynchronize()); - cudaDeviceReset(); + // Note: CUDA resources are cleaned up by CUDAService destructor } diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc index 0c7c351f437f5..5bde9d9cc6b67 100644 --- a/HeterogeneousCore/CUDACore/test/test_main.cc +++ b/HeterogeneousCore/CUDACore/test/test_main.cc @@ -1,2 +1,29 @@ #define CATCH_CONFIG_MAIN #include "catch.hpp" + +#include "FWCore/PluginManager/interface/standard.h" +#include "FWCore/PluginManager/interface/PluginManager.h" +#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h" + +class ServiceRegistryListener: public Catch::TestEventListenerBase { +public: + using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor + + void testRunStarting(Catch::TestRunInfo const& testRunInfo) override { + edmplugin::PluginManager::configure(edmplugin::standard::config()); + + const std::string config{ +R"_(import FWCore.ParameterSet.Config as cms +process = cms.Process('Test') +process.CUDAService = cms.Service('CUDAService') +)_" + }; + + edm::ServiceToken tempToken = edm::ServiceRegistry::createServicesFromConfig(config); + operate_.reset(new edm::ServiceRegistry::Operate(tempToken)); + } + +private: + std::unique_ptr operate_; +}; +CATCH_REGISTER_LISTENER(ServiceRegistryListener); From cf491ae586e4deed58d2af93bf24805a9c4b67b6 Mon Sep 17 00:00:00 2001 From: Andrea Bocci Date: Mon, 8 Apr 2019 14:49:27 +0200 Subject: [PATCH 2/2] Fix comment --- CUDADataFormats/Common/interface/CUDAProductBase.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index 797a364f77516..ef767d8635251 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -62,11 +62,12 @@ class CUDAProductBase { // shared_ptr because of caching in CUDAService std::shared_ptr event_; //! - // This flag tellswhether the CUDA stream may be reused by a + // This flag tells whether the CUDA stream may be reused by a // consumer or not. The goal is to have a "chain" of modules to // queue their work to the same stream. mutable std::atomic mayReuseStream_ = true; //! + // The CUDA device associated with this product int device_ = -1; //! };