From 4abd97cdd80a67588023f9a7385236953dea5195 Mon Sep 17 00:00:00 2001 From: Roman Arzumanyan Date: Thu, 16 Sep 2021 00:08:00 +0300 Subject: [PATCH] CudaBuffer class added for simple 1D CUDA memory allocation --- PyNvCodec/TC/inc/MemoryInterfaces.hpp | 30 +++++++ PyNvCodec/TC/inc/Tasks.hpp | 39 +++++++++ PyNvCodec/TC/src/MemoryInterfaces.cpp | 68 ++++++++++++++- PyNvCodec/TC/src/Tasks.cpp | 117 ++++++++++++++++++++++++++ PyNvCodec/inc/PyNvCodec.hpp | 35 ++++++++ PyNvCodec/src/PyNvCodec.cpp | 105 +++++++++++++++++++++++ 6 files changed, 392 insertions(+), 2 deletions(-) diff --git a/PyNvCodec/TC/inc/MemoryInterfaces.hpp b/PyNvCodec/TC/inc/MemoryInterfaces.hpp index cd602b19..b28f735c 100644 --- a/PyNvCodec/TC/inc/MemoryInterfaces.hpp +++ b/PyNvCodec/TC/inc/MemoryInterfaces.hpp @@ -100,6 +100,36 @@ class DllExport Buffer final : public Token { #endif }; +class DllExport CudaBuffer final : public Token { +public: + CudaBuffer() = delete; + CudaBuffer(const CudaBuffer &other) = delete; + CudaBuffer &operator=(CudaBuffer &other) = delete; + + static CudaBuffer *Make(size_t elemSize, size_t numElems, CUcontext context); + CudaBuffer *Clone(); + + size_t GetRawMemSize() const { return elem_size * num_elems; } + size_t GetNumElems() const { return num_elems; } + size_t GetElemSize() const { return elem_size; } + CUdeviceptr GpuMem() { return gpuMem; } + ~CudaBuffer(); + +private: + CudaBuffer(size_t elemSize, size_t numElems, CUcontext context); + bool Allocate(); + void Deallocate(); + + CUdeviceptr gpuMem = 0UL; + CUcontext ctx = nullptr; + size_t elem_size = 0U; + size_t num_elems = 0U; + +#ifdef TRACK_TOKEN_ALLOCATIONS + uint64_t id = 0U; +#endif +}; + /* RAII-style CUDA Context (un)lock; */ class DllExport CudaCtxPush final { diff --git a/PyNvCodec/TC/inc/Tasks.hpp b/PyNvCodec/TC/inc/Tasks.hpp index b06255f2..8fa43cd6 100644 --- a/PyNvCodec/TC/inc/Tasks.hpp +++ b/PyNvCodec/TC/inc/Tasks.hpp @@ -143,6 +143,26 @@ class DllExport CudaUploadFrame final : public Task { struct CudaUploadFrame_Impl *pImpl = nullptr; }; +class DllExport UploadBuffer final : public Task { +public: + UploadBuffer() = delete; + UploadBuffer(const UploadBuffer &other) = delete; + UploadBuffer &operator=(const UploadBuffer &other) = delete; + + TaskExecStatus Run() final; + size_t GetUploadSize() const; + ~UploadBuffer() final; + static UploadBuffer *Make(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems); + +private: + UploadBuffer(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems); + static const uint32_t numInputs = 1U; + static const uint32_t numOutputs = 1U; + struct UploadBuffer_Impl *pImpl = nullptr; +}; + class DllExport CudaDownloadSurface final : public Task { public: CudaDownloadSurface() = delete; @@ -163,6 +183,25 @@ class DllExport CudaDownloadSurface final : public Task { struct CudaDownloadSurface_Impl *pImpl = nullptr; }; +class DllExport DownloadCudaBuffer final : public Task { +public: + DownloadCudaBuffer() = delete; + DownloadCudaBuffer(const DownloadCudaBuffer &other) = delete; + DownloadCudaBuffer &operator=(const DownloadCudaBuffer &other) = delete; + + ~DownloadCudaBuffer() final; + TaskExecStatus Run() final; + static DownloadCudaBuffer *Make(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems); + +private: + DownloadCudaBuffer(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems); + static const uint32_t numInputs = 1U; + static const uint32_t numOutputs = 1U; + struct DownloadCudaBuffer_Impl *pImpl = nullptr; +}; + class DllExport DemuxFrame final : public Task { public: DemuxFrame() = delete; diff --git a/PyNvCodec/TC/src/MemoryInterfaces.cpp b/PyNvCodec/TC/src/MemoryInterfaces.cpp index 063f847b..ef15afa7 100644 --- a/PyNvCodec/TC/src/MemoryInterfaces.cpp +++ b/PyNvCodec/TC/src/MemoryInterfaces.cpp @@ -77,11 +77,12 @@ struct AllocRegister { } }; -AllocRegister BuffersRegister, HWSurfaceRegister; +AllocRegister BuffersRegister, HWSurfaceRegister, CudaBuffersRegiser; bool CheckAllocationCounters() { auto numLeakedBuffers = BuffersRegister.GetSize(); auto numLeakedSurfaces = HWSurfaceRegister.GetSize(); + auto numLeakedCudaBuffers = CudaBuffersRegiser.GetSize(); if (numLeakedBuffers) { cerr << "Leaked buffers (id : size): " << endl; @@ -99,7 +100,15 @@ bool CheckAllocationCounters() { } } - return (0U == numLeakedBuffers) && (0U == numLeakedSurfaces); + if (numLeakedCudaBuffers) { + cerr << "Leaked CUDA buffers (id : size): " << endl; + for (auto i = 0; i < numLeakedCudaBuffers; i++) { + auto pNote = CudaBuffersRegiser.GetNoteByIndex(i); + cerr << "\t" << pNote->id << "\t: " << pNote->size << endl; + } + } + + return (0U == numLeakedBuffers) && (0U == numLeakedSurfaces) && (0U == numLeakedCudaBuffers); } } // namespace VPF @@ -261,6 +270,61 @@ Buffer *Buffer::MakeOwnMem(size_t bufferSize, const void *pCopyFrom, return new Buffer(bufferSize, pCopyFrom, ctx); } +CudaBuffer* CudaBuffer::Make(size_t elemSize, size_t numElems, CUcontext context) { + return new CudaBuffer(elemSize, numElems, context); +} + +CudaBuffer *CudaBuffer::Clone() { + auto pCopy = CudaBuffer::Make(elem_size, num_elems, ctx); + + if (CUDA_SUCCESS != cuMemcpyDtoD(pCopy->GpuMem(), GpuMem(), GetRawMemSize())) { + delete pCopy; + return nullptr; + } + + return pCopy; +} + +CudaBuffer::~CudaBuffer() { + Deallocate(); +} + +CudaBuffer::CudaBuffer(size_t elemSize, size_t numElems, CUcontext context) { + elem_size = elemSize; + num_elems = numElems; + ctx = context; + + if (!Allocate()) { + throw bad_alloc(); + } +} + +bool CudaBuffer::Allocate() { + if (GetRawMemSize()) { + CudaCtxPush lock(ctx); + auto res = cuMemAlloc(&gpuMem, GetRawMemSize()); + ThrowOnCudaError(res, __LINE__); + + if (0U != gpuMem) { +#ifdef TRACK_TOKEN_ALLOCATIONS + id = CudaBuffersRegiser.AddNote(GetRawMemSize()); +#endif + return true; + } + } + return false; +} + +void CudaBuffer::Deallocate() { + ThrowOnCudaError(cuMemFree(gpuMem), __LINE__); + gpuMem = 0U; + +#ifdef TRACK_TOKEN_ALLOCATIONS + AllocInfo info(id, GetRawMemSize()); + CudaBuffersRegiser.DeleteNote(info); +#endif +} + SurfacePlane::SurfacePlane() = default; SurfacePlane &SurfacePlane::operator=(const SurfacePlane &other) { diff --git a/PyNvCodec/TC/src/Tasks.cpp b/PyNvCodec/TC/src/Tasks.cpp index ac3d3c00..369668b6 100644 --- a/PyNvCodec/TC/src/Tasks.cpp +++ b/PyNvCodec/TC/src/Tasks.cpp @@ -492,6 +492,65 @@ TaskExecStatus CudaUploadFrame::Run() { return TASK_EXEC_SUCCESS; } +namespace VPF { +struct UploadBuffer_Impl { + CUstream cuStream; + CUcontext cuContext; + CudaBuffer *pBuffer = nullptr; + + UploadBuffer_Impl() = delete; + UploadBuffer_Impl(const UploadBuffer_Impl &other) = delete; + UploadBuffer_Impl &operator=(const UploadBuffer_Impl &other) = delete; + + UploadBuffer_Impl(CUstream stream, CUcontext context, + uint32_t elem_size, uint32_t num_elems) + : cuStream(stream), cuContext(context) { + pBuffer = CudaBuffer::Make(elem_size, num_elems, context); + } + + ~UploadBuffer_Impl() { delete pBuffer; } +}; +} // namespace VPF + +UploadBuffer *UploadBuffer::Make(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems) { + return new UploadBuffer(cuStream, cuContext, elem_size, num_elems); +} + +UploadBuffer::UploadBuffer(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems) + : + + Task("UploadBuffer", UploadBuffer::numInputs, + UploadBuffer::numOutputs, cuda_stream_sync, (void *)cuStream) { + pImpl = new UploadBuffer_Impl(cuStream, cuContext, elem_size, num_elems); +} + +UploadBuffer::~UploadBuffer() { delete pImpl; } + +TaskExecStatus UploadBuffer::Run() { + NvtxMark tick(__FUNCTION__); + if (!GetInput()) { + return TASK_EXEC_FAIL; + } + + ClearOutputs(); + + auto stream = pImpl->cuStream; + auto context = pImpl->cuContext; + auto pBuffer = pImpl->pBuffer; + auto pSrcHost = ((Buffer *)GetInput())->GetDataAs(); + + CudaCtxPush lock(context); + if (CUDA_SUCCESS != cuMemcpyHtoDAsync(pBuffer->GpuMem(), (const void *)pSrcHost, + pBuffer->GetRawMemSize(), stream)) { + return TASK_EXEC_FAIL; + } + + SetOutput(pBuffer, 0); + return TASK_EXEC_SUCCESS; +} + namespace VPF { struct CudaDownloadSurface_Impl { CUstream cuStream; @@ -528,6 +587,25 @@ struct CudaDownloadSurface_Impl { ~CudaDownloadSurface_Impl() { delete pHostFrame; } }; + +struct DownloadCudaBuffer_Impl { + CUstream cuStream; + CUcontext cuContext; + Buffer *pHostBuffer = nullptr; + + DownloadCudaBuffer_Impl() = delete; + DownloadCudaBuffer_Impl(const DownloadCudaBuffer_Impl &other) = delete; + DownloadCudaBuffer_Impl & + operator=(const DownloadCudaBuffer_Impl &other) = delete; + + DownloadCudaBuffer_Impl(CUstream stream, CUcontext context, uint32_t elem_size, + uint32_t num_elems) + : cuStream(stream), cuContext(context) { + pHostBuffer = Buffer::MakeOwnMem(elem_size * num_elems, context); + } + + ~DownloadCudaBuffer_Impl() { delete pHostBuffer; } +}; } // namespace VPF CudaDownloadSurface *CudaDownloadSurface::Make(CUstream cuStream, @@ -591,6 +669,45 @@ TaskExecStatus CudaDownloadSurface::Run() { return TASK_EXEC_SUCCESS; } +DownloadCudaBuffer *DownloadCudaBuffer::Make(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems) { + return new DownloadCudaBuffer(cuStream, cuContext, elem_size, num_elems); +} + +DownloadCudaBuffer::DownloadCudaBuffer(CUstream cuStream, CUcontext cuContext, + uint32_t elem_size, uint32_t num_elems) : + Task("DownloadCudaBuffer", DownloadCudaBuffer::numInputs, + DownloadCudaBuffer::numOutputs, cuda_stream_sync, + (void *)cuStream) { + pImpl = new DownloadCudaBuffer_Impl(cuStream, cuContext, elem_size, num_elems); +} + +DownloadCudaBuffer::~DownloadCudaBuffer() { delete pImpl; } + +TaskExecStatus DownloadCudaBuffer::Run() { + NvtxMark tick(__FUNCTION__); + + if (!GetInput()) { + return TASK_EXEC_FAIL; + } + + ClearOutputs(); + + auto stream = pImpl->cuStream; + auto context = pImpl->cuContext; + auto pCudaBuffer = (CudaBuffer *)GetInput(); + auto pDstHost = ((Buffer *)pImpl->pHostBuffer)->GetDataAs(); + + CudaCtxPush lock(context); + if (CUDA_SUCCESS != cuMemcpyDtoHAsync(pDstHost, pCudaBuffer->GpuMem(), + pCudaBuffer->GetRawMemSize(), stream)) { + return TASK_EXEC_FAIL; + } + + SetOutput(pImpl->pHostBuffer, 0); + return TASK_EXEC_SUCCESS; +} + namespace VPF { struct DemuxFrame_Impl { size_t videoBytes = 0U; diff --git a/PyNvCodec/inc/PyNvCodec.hpp b/PyNvCodec/inc/PyNvCodec.hpp index a7994b19..45f6c2d1 100644 --- a/PyNvCodec/inc/PyNvCodec.hpp +++ b/PyNvCodec/inc/PyNvCodec.hpp @@ -80,6 +80,23 @@ class PyFrameUploader { std::shared_ptr UploadSingleFrame(py::array_t &frame); }; +class PyBufferUploader { + std::unique_ptr uploader; + uint32_t elem_size, num_elems; + +public: + PyBufferUploader(uint32_t elemSize, uint32_t numElems, uint32_t gpu_ID); + + PyBufferUploader(uint32_t elemSize, uint32_t numElems, CUcontext ctx, + CUstream str); + + PyBufferUploader(uint32_t elemSize, uint32_t numElems, + size_t ctx, size_t str) : + PyBufferUploader(elemSize, numElems, (CUcontext)ctx, (CUstream)str) {} + + std::shared_ptr UploadSingleBuffer(py::array_t &buffer); +}; + class PySurfaceDownloader { std::unique_ptr upDownloader; uint32_t surfaceWidth, surfaceHeight; @@ -102,6 +119,24 @@ class PySurfaceDownloader { py::array_t &frame); }; +class PyCudaBufferDownloader { + std::unique_ptr upDownloader; + uint32_t elem_size, num_elems; + +public: + PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, uint32_t gpu_ID); + + PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, CUcontext ctx, + CUstream str); + + PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, + size_t ctx, size_t str) : + PyCudaBufferDownloader(elemSize, numElems, (CUcontext)ctx, (CUstream)str) {} + + bool DownloadSingleCudaBuffer(std::shared_ptr buffer, + py::array_t &np_array); +}; + class PySurfaceConverter { std::unique_ptr upConverter; std::unique_ptr upCtxBuffer; diff --git a/PyNvCodec/src/PyNvCodec.cpp b/PyNvCodec/src/PyNvCodec.cpp index 5e17a871..b52ee344 100644 --- a/PyNvCodec/src/PyNvCodec.cpp +++ b/PyNvCodec/src/PyNvCodec.cpp @@ -216,6 +216,44 @@ PyFrameUploader::UploadSingleFrame(py::array_t &frame) { return shared_ptr(pSurface->Clone()); } +PyBufferUploader::PyBufferUploader(uint32_t elemSize, uint32_t numElems, + uint32_t gpu_ID) +{ + elem_size = elemSize; + num_elems = numElems; + + uploader.reset(UploadBuffer::Make(CudaResMgr::Instance().GetStream(gpu_ID), + CudaResMgr::Instance().GetCtx(gpu_ID), + elem_size, num_elems)); +} + +PyBufferUploader::PyBufferUploader(uint32_t elemSize, uint32_t numElems, + CUcontext ctx, CUstream str) +{ + elem_size = elemSize; + num_elems = numElems; + + uploader.reset(UploadBuffer::Make(str, ctx, elem_size, num_elems)); +} + +shared_ptr +PyBufferUploader::UploadSingleBuffer(py::array_t &frame) +{ + auto pRawBuf = Buffer::Make(frame.size(), frame.mutable_data()); + uploader->SetInput(pRawBuf, 0U); + auto res = uploader->Execute(); + delete pRawBuf; + + if (TASK_EXEC_FAIL == res) + throw runtime_error("Error uploading frame to GPU"); + + auto pCudaBuffer = (CudaBuffer *)uploader->GetOutput(0U); + if (!pCudaBuffer) + throw runtime_error("Error uploading frame to GPU"); + + return shared_ptr(pCudaBuffer->Clone()); +} + PySurfaceDownloader::PySurfaceDownloader(uint32_t width, uint32_t height, Pixel_Format format, uint32_t gpu_ID) { surfaceWidth = width; @@ -262,6 +300,52 @@ bool PySurfaceDownloader::DownloadSingleSurface(shared_ptr surface, return false; } +PyCudaBufferDownloader::PyCudaBufferDownloader(uint32_t elemSize, + uint32_t numElems, uint32_t gpu_ID) +{ + elem_size = elemSize; + num_elems = numElems; + + upDownloader.reset( + DownloadCudaBuffer::Make(CudaResMgr::Instance().GetStream(gpu_ID), + CudaResMgr::Instance().GetCtx(gpu_ID), + elem_size, num_elems)); +} + +PyCudaBufferDownloader::PyCudaBufferDownloader(uint32_t elemSize, uint32_t numElems, + CUcontext ctx, CUstream str) +{ + elem_size = elemSize; + num_elems = numElems; + + upDownloader.reset(DownloadCudaBuffer::Make(str, ctx, elem_size, num_elems)); +} + +bool PyCudaBufferDownloader::DownloadSingleCudaBuffer(std::shared_ptr buffer, + py::array_t &np_array) +{ + upDownloader->SetInput(buffer.get(), 0U); + if (TASK_EXEC_FAIL == upDownloader->Execute()) + { + return false; + } + + auto *pRawBuf = (Buffer *)upDownloader->GetOutput(0U); + if (pRawBuf) + { + auto const downloadSize = pRawBuf->GetRawMemSize(); + if (downloadSize != np_array.size()) + { + np_array.resize({downloadSize}, false); + } + + memcpy(np_array.mutable_data(), pRawBuf->GetRawMemPtr(), downloadSize); + return true; + } + + return false; +} + PySurfaceConverter::PySurfaceConverter(uint32_t width, uint32_t height, Pixel_Format inFormat, Pixel_Format outFormat, uint32_t gpuID) @@ -1696,6 +1780,13 @@ PYBIND11_MODULE(PyNvCodec, m) .def_readwrite("color_space", &ColorspaceConversionContext::color_space) .def_readwrite("color_range", &ColorspaceConversionContext::color_range); + py::class_>(m, "CudaBuffer") + .def("GetRawMemSize", &CudaBuffer::GetRawMemSize) + .def("GetNumElems", &CudaBuffer::GetNumElems) + .def("GetElemSize", &CudaBuffer::GetElemSize) + .def("GpuMem", &CudaBuffer::GpuMem) + .def("Clone", &CudaBuffer::Clone, py::return_value_policy::take_ownership); + py::class_>(m, "SurfacePlane") .def("Width", &SurfacePlane::Width) .def("Height", &SurfacePlane::Height) @@ -2079,6 +2170,13 @@ PYBIND11_MODULE(PyNvCodec, m) py::return_value_policy::take_ownership, py::call_guard()); + py::class_(m, "PyBufferUploader") + .def(py::init()) + .def(py::init()) + .def("UploadSingleBuffer", &PyBufferUploader::UploadSingleBuffer, + py::return_value_policy::take_ownership, + py::call_guard()); + py::class_(m, "PySurfaceDownloader") .def(py::init()) .def(py::init()) @@ -2087,6 +2185,13 @@ PYBIND11_MODULE(PyNvCodec, m) &PySurfaceDownloader::DownloadSingleSurface, py::call_guard()); + py::class_(m, "PyCudaBufferDownloader") + .def(py::init()) + .def(py::init()) + .def("DownloadSingleCudaBuffer", + &PyCudaBufferDownloader::DownloadSingleCudaBuffer, + py::call_guard()); + py::class_(m, "PySurfaceConverter") .def(py::init()) .def(py::init())