From 3d44c5f8fcd553b1b5c1cf03a3d653792f27bce1 Mon Sep 17 00:00:00 2001 From: Debadri Samaddar Date: Wed, 4 Dec 2024 15:28:04 +0530 Subject: [PATCH 1/3] [GPU] Global Buffer manager and optimization Implemented global Buffers Optimized pipeline due to reduced buffer creation steps Modified command queue and Buffer wrappers Signed-off-by: Debadri Samaddar --- nntrainer/cl_buffer_manager.cpp | 41 +++++++++++ nntrainer/cl_buffer_manager.h | 72 +++++++++++++++++++ nntrainer/meson.build | 2 + nntrainer/opencl/opencl_buffer.cpp | 30 +++++++- nntrainer/opencl/opencl_buffer.h | 34 ++++++++- .../opencl/opencl_command_queue_manager.cpp | 71 ++++++++++++++++++ .../opencl/opencl_command_queue_manager.h | 30 ++++++++ nntrainer/opencl/opencl_loader.cpp | 4 ++ nntrainer/opencl/opencl_loader.h | 20 ++++++ .../tensor/cl_operations/blas_kernels.cpp | 30 ++++---- nntrainer/tensor/cl_operations/blas_kernels.h | 3 + 11 files changed, 319 insertions(+), 18 deletions(-) create mode 100644 nntrainer/cl_buffer_manager.cpp create mode 100644 nntrainer/cl_buffer_manager.h diff --git a/nntrainer/cl_buffer_manager.cpp b/nntrainer/cl_buffer_manager.cpp new file mode 100644 index 0000000000..c2af78ca54 --- /dev/null +++ b/nntrainer/cl_buffer_manager.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file cl_buffer_manager.cpp + * @date 01 Dec 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * @brief This file contains global Buffer objects and manages them + */ + +#include + +namespace nntrainer { + +ClBufferManager &ClBufferManager::getInstance() { + static ClBufferManager instance; + return instance; +} + +// to-do: Implementation to be updated with array of Buffer objects if required +// fp16 Buffer objects to be added in future +ClBufferManager::ClBufferManager() { + readBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + readBufferB 
= new opencl::Buffer(context_inst_, buffer_size_bytes, true); + readBufferC = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + writeBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, false); + writeBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, false); + ml_logi("ClBufferManager: Buffers initialized"); +} + +ClBufferManager::~ClBufferManager() { + delete readBufferA; + delete readBufferB; + delete readBufferC; + delete writeBufferA; + delete writeBufferB; +} + +} // namespace nntrainer diff --git a/nntrainer/cl_buffer_manager.h b/nntrainer/cl_buffer_manager.h new file mode 100644 index 0000000000..875e7e9bc0 --- /dev/null +++ b/nntrainer/cl_buffer_manager.h @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file cl_buffer_manager.h + * @date 01 Dec 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * @brief This file contains global Buffer objects and manages them + */ + +#ifndef __CL_BUFFER_MANAGER_H__ +#define __CL_BUFFER_MANAGER_H__ + +#include + +#include +#include + +#include + +namespace nntrainer { + +/** + * @class ClBufferManager contains Buffer object management + * @brief Support for Buffer management + */ + +class ClBufferManager { + +private: + /** + * @brief Private constructor to prevent object creation + * + */ + ClBufferManager(); + + /** + * @brief OpenCl context global instance + * + */ + opencl::ContextManager &context_inst_ = opencl::ContextManager::GetInstance(); + + /** + * @brief Buffer size in bytes preset (256 mebibytes) + */ + size_t buffer_size_bytes = 8192 * 8192 * sizeof(float); + +public: + /** + * @brief Get Global ClBufferManager. 
+ * + * @return ClBufferManager& + */ + static ClBufferManager &getInstance(); + + opencl::Buffer *readBufferA; + opencl::Buffer *readBufferB; + opencl::Buffer *readBufferC; + opencl::Buffer *writeBufferA; + opencl::Buffer *writeBufferB; + + /** + * @brief Destroy Buffer pointers. + * + */ + ~ClBufferManager(); +}; +} // namespace nntrainer + +#endif /* __CL_BUFFER_MANAGER_H__ */ diff --git a/nntrainer/meson.build b/nntrainer/meson.build index 2fa705a756..ed15b8f2a7 100644 --- a/nntrainer/meson.build +++ b/nntrainer/meson.build @@ -64,7 +64,9 @@ nntrainer_common_sources = [ if get_option('enable-opencl') nntrainer_headers += meson.current_source_dir() / 'cl_context.h' + nntrainer_headers += meson.current_source_dir() / 'cl_buffer_manager.h' nntrainer_common_sources += 'cl_context.cpp' + nntrainer_common_sources += 'cl_buffer_manager.cpp' endif foreach s : nntrainer_common_sources diff --git a/nntrainer/opencl/opencl_buffer.cpp b/nntrainer/opencl/opencl_buffer.cpp index 1614f3e622..07ae213a94 100644 --- a/nntrainer/opencl/opencl_buffer.cpp +++ b/nntrainer/opencl/opencl_buffer.cpp @@ -27,7 +27,7 @@ namespace nntrainer::opencl { * @param read_only flag * @param data data for the buffer */ -Buffer::Buffer(ContextManager &context_manager, int size_in_bytes, +Buffer::Buffer(ContextManager &context_manager, size_t size_in_bytes, bool read_only, void *data) { cl_context context = context_manager.GetContext(); cl_mem_flags flags = read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE; @@ -94,6 +94,20 @@ bool Buffer::WriteData(CommandQueueManager &command_queue_inst, return command_queue_inst.EnqueueWriteBuffer(mem_buf_, size_, data); } +bool Buffer::WriteDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, const void *data, + size_t host_origin_offset, + size_t buffer_origin_offset) { + if (size_in_bytes > size_) { + ml_loge("Failed to write buffer region. 
Region size(%lu bytes) greater " + "than buffer size(%lu bytes).", + size_in_bytes, size_); + return false; + } + return command_queue_inst.EnqueueWriteBufferRegion( + mem_buf_, size_in_bytes, data, host_origin_offset, buffer_origin_offset); +} + /** * @brief reading data from the buffer * @@ -105,6 +119,20 @@ bool Buffer::ReadData(CommandQueueManager &command_queue_inst, void *data) { return command_queue_inst.EnqueueReadBuffer(mem_buf_, size_, data); } +bool Buffer::ReadDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, void *data, + size_t host_origin_offset, + size_t buffer_origin_offset) { + if (size_in_bytes > size_) { + ml_loge("Failed to read from buffer region. Region size(%lu bytes) greater " + "than buffer size(%lu bytes).", + size_in_bytes, size_); + return false; + } + return command_queue_inst.EnqueueReadBufferRegion( + mem_buf_, size_in_bytes, data, host_origin_offset, buffer_origin_offset); +} + void *Buffer::MapBuffer(CommandQueueManager &command_queue_inst, size_t offset_in_bytes, size_t size_in_bytes, bool read_only, bool async) { diff --git a/nntrainer/opencl/opencl_buffer.h b/nntrainer/opencl/opencl_buffer.h index 031efe5cbf..beddb30339 100644 --- a/nntrainer/opencl/opencl_buffer.h +++ b/nntrainer/opencl/opencl_buffer.h @@ -54,8 +54,8 @@ class Buffer { * @param read_only flag * @param data data for the buffer */ - Buffer(ContextManager &context_manager, int size_in_bytes, bool read_only, - void *data); + Buffer(ContextManager &context_manager, size_t size_in_bytes, bool read_only, + void *data = nullptr); /** * @brief Move constructor for buffer by deleting the previous buffer @@ -106,6 +106,21 @@ class Buffer { */ bool WriteData(CommandQueueManager &command_queue_inst, const void *data); + /** + * @brief writing data to a buffer region + * + * @param command_queue_inst reference of command queue instance + * @param size_in_bytes size of region + * @param data pointer of region + * @param host_origin_offset offset in 
the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @return true if successful write or false otherwise + */ + bool WriteDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, const void *data, + size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0); + /** * @brief reading data from the buffer * @@ -115,6 +130,21 @@ class Buffer { */ bool ReadData(CommandQueueManager &command_queue_inst, void *data); + /** + * @brief Reading data from a buffer region + * + * @param command_queue_inst reference of command queue instance + * @param size_in_bytes size of region + * @param data pointer of region + * @param host_origin_offset offset in the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @return true if successful read or false otherwise + */ + bool ReadDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, void *data, + size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0); + /** * @brief Mapping buffer to host memory * diff --git a/nntrainer/opencl/opencl_command_queue_manager.cpp b/nntrainer/opencl/opencl_command_queue_manager.cpp index e1bffa9f28..78b32537c1 100644 --- a/nntrainer/opencl/opencl_command_queue_manager.cpp +++ b/nntrainer/opencl/opencl_command_queue_manager.cpp @@ -131,6 +131,41 @@ bool CommandQueueManager::EnqueueReadBuffer(cl_mem buffer, size_t size_in_bytes, return true; } +bool CommandQueueManager::EnqueueReadBufferRegion( + cl_mem buffer, size_t size_in_bytes, void *data, size_t host_origin_offset, + size_t buffer_origin_offset, bool async) { + + // managing synchronization + const cl_bool blocking = async ? 
CL_FALSE : CL_TRUE; + + // (x, y, z) offset in the memory region associated with buffer + const size_t buffer_origin[] = {buffer_origin_offset, 0, 0}; + // (x, y, z) offset in the memory region associated with host + const size_t host_origin[] = {host_origin_offset, 0, 0}; + // region defines the (width in bytes, height in rows, depth in slices) + const size_t region[] = {size_in_bytes, 1, 1}; + // length of each row in bytes + size_t row_pitch = region[0]; + // length of each 2D slice in bytes + size_t slice_pitch = region[0] * region[1]; + + // Buffer and host data are interpreted as 1D in this case + // hence row and slice pitch are same for both + cl_int error_code = clEnqueueReadBufferRect( + command_queue_, buffer, blocking, buffer_origin, host_origin, region, + row_pitch, slice_pitch, row_pitch, slice_pitch, data, 0, nullptr, nullptr); + + if (error_code != CL_SUCCESS) { + ml_loge("Failed to write data region to GPU (clEnqueueWriteBufferRect). " + "OpenCL error " + "code: %d", + error_code); + return false; + } + + return true; +} + /** * @brief Writing buffer object. Used from Buffer class * @@ -150,6 +185,7 @@ bool CommandQueueManager::EnqueueWriteBuffer(cl_mem buffer, auto error_code = clEnqueueWriteBuffer(command_queue_, buffer, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr); + if (error_code != CL_SUCCESS) { ml_loge("Failed to upload data to GPU (clEnqueueWriteBuffer). OpenCL error " "code: %d", @@ -160,6 +196,41 @@ bool CommandQueueManager::EnqueueWriteBuffer(cl_mem buffer, return true; } +bool CommandQueueManager::EnqueueWriteBufferRegion( + cl_mem buffer, size_t size_in_bytes, const void *data, + size_t host_origin_offset, size_t buffer_origin_offset, bool async) { + + // managing synchronization + const cl_bool blocking = async ? 
CL_FALSE : CL_TRUE; + + // (x, y, z) offset in the memory region associated with buffer + const size_t buffer_origin[] = {buffer_origin_offset, 0, 0}; + // (x, y, z) offset in the memory region associated with host + const size_t host_origin[] = {host_origin_offset, 0, 0}; + // region defines the (width in bytes, height in rows, depth in slices) + const size_t region[] = {size_in_bytes, 1, 1}; + // length of each row in bytes + size_t row_pitch = region[0]; + // length of each 2D slice in bytes + size_t slice_pitch = region[0] * region[1]; + + // Buffer and host data are interpreted as 1D in this case + // hence row and slice pitch are same for both + cl_int error_code = clEnqueueWriteBufferRect( + command_queue_, buffer, blocking, buffer_origin, host_origin, region, + row_pitch, slice_pitch, row_pitch, slice_pitch, data, 0, nullptr, nullptr); + + if (error_code != CL_SUCCESS) { + ml_loge("Failed to write data region to GPU (clEnqueueWriteBufferRect). " + "OpenCL error " + "code: %d", + error_code); + return false; + } + + return true; +} + /** * @brief Mapping a region of a buffer object into the host address space * diff --git a/nntrainer/opencl/opencl_command_queue_manager.h b/nntrainer/opencl/opencl_command_queue_manager.h index 8f9965e654..7047f4a2c6 100644 --- a/nntrainer/opencl/opencl_command_queue_manager.h +++ b/nntrainer/opencl/opencl_command_queue_manager.h @@ -73,6 +73,22 @@ class CommandQueueManager { bool EnqueueReadBuffer(cl_mem buffer, size_t size_in_bytes, void *data, bool async = false); + /** + * @brief Reading 1D region from a buffer object. 
Used from Buffer class + * + * @param buffer cl_mem buffer object + * @param size_in_bytes size of data region + * @param data pointer for the region + * @param host_origin_offset offset in the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @param async flag for asynchronous operation + * @return true if reading is successful or false otherwise + */ + bool EnqueueReadBufferRegion(cl_mem buffer, size_t size_in_bytes, void *data, + size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0, + bool async = false); + /** * @brief Writing buffer object. Used from Buffer class * @@ -85,6 +101,20 @@ class CommandQueueManager { bool EnqueueWriteBuffer(cl_mem buffer, size_t size_in_bytes, const void *data, bool async = false); + /** + * @brief Writing 1D region of a buffer object. Used from Buffer class + * + * @param buffer cl_mem buffer object + * @param size_in_bytes size of data region + * @param data pointer for the region + * @param host_origin_offset,buffer_origin_offset offsets in the host and buffer memory regions + * @param async flag for asynchronous operation + * @return true if writing is successful or false otherwise + */ + bool EnqueueWriteBufferRegion(cl_mem buffer, size_t size_in_bytes, + const void *data, size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0, + bool async = false); /** * @brief Mapping a region of a buffer object into the host address space * diff --git a/nntrainer/opencl/opencl_loader.cpp b/nntrainer/opencl/opencl_loader.cpp index e3cf3e73c2..8b39eace80 100644 --- a/nntrainer/opencl/opencl_loader.cpp +++ b/nntrainer/opencl/opencl_loader.cpp @@ -75,6 +75,8 @@ void LoadOpenCLFunctions(void *libopencl) { LoadFunction(clEnqueueReadBuffer); LoadFunction(clEnqueueMapBuffer); LoadFunction(clEnqueueUnmapMemObject); + LoadFunction(clEnqueueWriteBufferRect); + LoadFunction(clEnqueueReadBufferRect); LoadFunction(clCreateProgramWithSource); LoadFunction(clCreateProgramWithBinary); LoadFunction(clBuildProgram); @@ -102,6 +104,8 @@ 
PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer; PFN_clEnqueueReadBuffer clEnqueueReadBuffer; PFN_clEnqueueMapBuffer clEnqueueMapBuffer; PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; +PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; +PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect; PFN_clCreateProgramWithSource clCreateProgramWithSource; PFN_clCreateProgramWithBinary clCreateProgramWithBinary; PFN_clBuildProgram clBuildProgram; diff --git a/nntrainer/opencl/opencl_loader.h b/nntrainer/opencl/opencl_loader.h index 0aa2a5cfd6..20cdef58fb 100644 --- a/nntrainer/opencl/opencl_loader.h +++ b/nntrainer/opencl/opencl_loader.h @@ -87,6 +87,24 @@ typedef cl_int(CL_API_CALL *PFN_clEnqueueUnmapMemObject)( const cl_event * /**< event_wait_list */, cl_event * /**< event */ ); +typedef cl_int(CL_API_CALL *PFN_clEnqueueWriteBufferRect)( + cl_command_queue /**< command_queue */, cl_mem /**< buffer */, + cl_bool /**< blocking_write */, const size_t * /**< buffer_offset */, + const size_t * /**< host_offset */, const size_t * /**< region */, + size_t /**< buffer_row_pitch */, size_t /**< buffer_slice_pitch */, + size_t /**< host_row_pitch */, size_t /**< host_slice_pitch */, + const void * /**< ptr */, cl_uint /**< num_events_in_wait_list */, + const cl_event * /**< event_wait_list */, cl_event * /**< event */); + +typedef cl_int(CL_API_CALL *PFN_clEnqueueReadBufferRect)( + cl_command_queue /**< command_queue */, cl_mem /**< buffer */, + cl_bool /**< blocking_read */, const size_t * /**< buffer_offset */, + const size_t * /**< host_offset */, const size_t * /**< region */, + size_t /**< buffer_row_pitch */, size_t /**< buffer_slice_pitch */, + size_t /**< host_row_pitch */, size_t /**< host_slice_pitch */, + void * /**< ptr */, cl_uint /**< num_events_in_wait_list */, + const cl_event * /**< event_wait_list */, cl_event * /**< event */); + typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithSource)( cl_context /**< context */, cl_uint /**< count */, const char ** 
/**< strings */, const size_t * /**< lengths */, @@ -161,6 +179,8 @@ extern PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer; extern PFN_clEnqueueReadBuffer clEnqueueReadBuffer; extern PFN_clEnqueueMapBuffer clEnqueueMapBuffer; extern PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; +extern PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; +extern PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect; extern PFN_clCreateProgramWithSource clCreateProgramWithSource; extern PFN_clCreateProgramWithBinary clCreateProgramWithBinary; extern PFN_clBuildProgram clBuildProgram; diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp index 558111b5a8..3ceccbdcd3 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels.cpp @@ -39,41 +39,40 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, size_t dim1_size = sizeof(float) * dim1; size_t dim2_size = sizeof(float) * dim2; - opencl::Buffer inputA(cl_context_ref.context_inst_, - dim1 * dim2 * sizeof(float), true, nullptr); - opencl::Buffer inputX(cl_context_ref.context_inst_, dim2_size, true, - nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + result = clbuffInstance.readBufferA->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float), + matAdata); if (!result) { break; } - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + result = clbuffInstance.readBufferB->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, vecXdata); if (!result) { break; } - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + result = clbuffInstance.writeBufferA->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } - result = 
kernel_sgemv_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments(0, clbuffInstance.readBufferA, + sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments(1, clbuffInstance.readBufferB, + sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments( + 2, clbuffInstance.writeBufferA, sizeof(cl_mem)); if (!result) { break; } @@ -97,7 +96,8 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, break; } - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + result = clbuffInstance.writeBufferA->ReadDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } diff --git a/nntrainer/tensor/cl_operations/blas_kernels.h b/nntrainer/tensor/cl_operations/blas_kernels.h index 7a62148888..2f16089465 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.h +++ b/nntrainer/tensor/cl_operations/blas_kernels.h @@ -14,15 +14,18 @@ #ifndef __BLAS_KERNELS_H__ #define __BLAS_KERNELS_H__ +#include #include #include #include + #include namespace nntrainer { // get global cl_context to use in kernels static ClContext cl_context_ref; +static ClBufferManager &clbuffInstance = ClBufferManager::getInstance(); /** * @brief sgemv computation : Y = A*X + Y From dab2d91121761580c9160dc5adc46ade34977c43 Mon Sep 17 00:00:00 2001 From: Debadri Samaddar Date: Thu, 5 Dec 2024 12:04:52 +0530 Subject: [PATCH 2/3] [GPU] Lazy initialization of Buffers Initialize buffer objects after command queue creation Signed-off-by: Debadri Samaddar --- nntrainer/cl_buffer_manager.cpp | 3 ++- nntrainer/cl_buffer_manager.h | 7 ++++++- nntrainer/cl_context.h | 8 +++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/nntrainer/cl_buffer_manager.cpp 
b/nntrainer/cl_buffer_manager.cpp index c2af78ca54..df5009dcc8 100644 --- a/nntrainer/cl_buffer_manager.cpp +++ b/nntrainer/cl_buffer_manager.cpp @@ -21,7 +21,7 @@ ClBufferManager &ClBufferManager::getInstance() { // to-do: Implementation to be updated with array of Buffer objects if required // fp16 Buffer objects to be added in future -ClBufferManager::ClBufferManager() { +void ClBufferManager::initBuffers() { readBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, true); readBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, true); readBufferC = new opencl::Buffer(context_inst_, buffer_size_bytes, true); @@ -36,6 +36,7 @@ ClBufferManager::~ClBufferManager() { delete readBufferC; delete writeBufferA; delete writeBufferB; + ml_logi("ClBufferManager: Buffers destroyed"); } } // namespace nntrainer diff --git a/nntrainer/cl_buffer_manager.h b/nntrainer/cl_buffer_manager.h index 875e7e9bc0..99b79ef4ca 100644 --- a/nntrainer/cl_buffer_manager.h +++ b/nntrainer/cl_buffer_manager.h @@ -34,7 +34,7 @@ class ClBufferManager { * @brief Private constructor to prevent object creation * */ - ClBufferManager(); + ClBufferManager(){}; /** * @brief OpenCl context global instance @@ -61,6 +61,11 @@ class ClBufferManager { opencl::Buffer *writeBufferA; opencl::Buffer *writeBufferB; + /** + * @brief Initialize Buffer objects. + */ + void initBuffers(); + /** * @brief Destroy Buffer pointers. 
* diff --git a/nntrainer/cl_context.h b/nntrainer/cl_context.h index 025365546b..e0b7209000 100644 --- a/nntrainer/cl_context.h +++ b/nntrainer/cl_context.h @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -79,12 +80,14 @@ class ClContext { template using FactoryMap = std::tuple...>; - // getting static instance of commandqueue and opencl context + // getting static instance of commandqueue, opencl context and buffermanager opencl::CommandQueueManager &command_queue_inst_ = opencl::CommandQueueManager::GetInstance(); opencl::ContextManager &context_inst_ = opencl::ContextManager::GetInstance(); + ClBufferManager &clbuffInstance = ClBufferManager::getInstance(); + /** * @brief Default constructor */ @@ -272,6 +275,9 @@ class ClContext { // getContext() called inside createCommandQueue which creates clContext bool result = command_queue_inst_.CreateCommandQueue(); + // initialize device buffers + clbuffInstance.initBuffers(); + cl_initialized = result; return cl_initialized; }; From 2633e0fef2e6a790e8ad9ecf7b334a5864b57c00 Mon Sep 17 00:00:00 2001 From: Debadri Samaddar Date: Tue, 10 Dec 2024 09:37:04 +0530 Subject: [PATCH 3/3] [GPU] Abstraction of cl_buffer_manager Adding abstraction in cl_buffer_manager using const for data size Signed-off-by: Debadri Samaddar --- nntrainer/cl_buffer_manager.cpp | 20 ++++---- nntrainer/cl_buffer_manager.h | 51 ++++++++++++++++--- .../tensor/cl_operations/blas_kernels.cpp | 18 +++---- 3 files changed, 62 insertions(+), 27 deletions(-) diff --git a/nntrainer/cl_buffer_manager.cpp b/nntrainer/cl_buffer_manager.cpp index df5009dcc8..cf61e80100 100644 --- a/nntrainer/cl_buffer_manager.cpp +++ b/nntrainer/cl_buffer_manager.cpp @@ -22,20 +22,20 @@ ClBufferManager &ClBufferManager::getInstance() { // to-do: Implementation to be updated with array of Buffer objects if required // fp16 Buffer objects to be added in future void ClBufferManager::initBuffers() { - readBufferA = new opencl::Buffer(context_inst_, 
buffer_size_bytes, true); - readBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, true); - readBufferC = new opencl::Buffer(context_inst_, buffer_size_bytes, true); - writeBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, false); - writeBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, false); + inBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + inBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + inBufferC = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + outBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, false); + outBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, false); ml_logi("ClBufferManager: Buffers initialized"); } ClBufferManager::~ClBufferManager() { - delete readBufferA; - delete readBufferB; - delete readBufferC; - delete writeBufferA; - delete writeBufferB; + delete inBufferA; + delete inBufferB; + delete inBufferC; + delete outBufferA; + delete outBufferB; ml_logi("ClBufferManager: Buffers destroyed"); } diff --git a/nntrainer/cl_buffer_manager.h b/nntrainer/cl_buffer_manager.h index 99b79ef4ca..9e4ea2b9fa 100644 --- a/nntrainer/cl_buffer_manager.h +++ b/nntrainer/cl_buffer_manager.h @@ -34,7 +34,12 @@ class ClBufferManager { * @brief Private constructor to prevent object creation * */ - ClBufferManager(){}; + ClBufferManager() : + inBufferA(nullptr), + inBufferB(nullptr), + inBufferC(nullptr), + outBufferA(nullptr), + outBufferB(nullptr){}; /** * @brief OpenCl context global instance @@ -45,7 +50,13 @@ class ClBufferManager { /** * @brief Buffer size in bytes preset (256 mebibytes) */ - size_t buffer_size_bytes = 8192 * 8192 * sizeof(float); + const size_t buffer_size_bytes = 8192 * 8192 * sizeof(float); + + opencl::Buffer *inBufferA; + opencl::Buffer *inBufferB; + opencl::Buffer *inBufferC; + opencl::Buffer *outBufferA; + opencl::Buffer *outBufferB; public: /** @@ -55,17 +66,41 @@ class ClBufferManager { */ static 
ClBufferManager &getInstance(); - opencl::Buffer *readBufferA; - opencl::Buffer *readBufferB; - opencl::Buffer *readBufferC; - opencl::Buffer *writeBufferA; - opencl::Buffer *writeBufferB; - /** * @brief Initialize Buffer objects. */ void initBuffers(); + /** + * @brief Get read only inBufferA. + * @return opencl::Buffer* or nullptr if initBuffers() is not called + */ + opencl::Buffer *getInBufferA() { return inBufferA; } + + /** + * @brief Get read only inBufferB. + * @return opencl::Buffer* or nullptr if initBuffers() is not called + */ + opencl::Buffer *getInBufferB() { return inBufferB; } + + /** + * @brief Get read only inBufferC. + * @return opencl::Buffer* or nullptr if initBuffers() is not called + */ + opencl::Buffer *getInBufferC() { return inBufferC; } + + /** + * @brief Get read-write outBufferA. + * @return opencl::Buffer* or nullptr if initBuffers() is not called + */ + opencl::Buffer *getOutBufferA() { return outBufferA; } + + /** + * @brief Get read-write outBufferB. + * @return opencl::Buffer* or nullptr if initBuffers() is not called + */ + opencl::Buffer *getOutBufferB() { return outBufferB; } + /** * @brief Destroy Buffer pointers. 
* diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp index 3ceccbdcd3..6c7751b8b0 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels.cpp @@ -40,39 +40,39 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, size_t dim1_size = sizeof(float) * dim1; size_t dim2_size = sizeof(float) * dim2; - result = clbuffInstance.readBufferA->WriteDataRegion( + result = clbuffInstance.getInBufferA()->WriteDataRegion( cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float), matAdata); if (!result) { break; } - result = clbuffInstance.readBufferB->WriteDataRegion( + result = clbuffInstance.getInBufferB()->WriteDataRegion( cl_context_ref.command_queue_inst_, dim2_size, vecXdata); if (!result) { break; } - result = clbuffInstance.writeBufferA->WriteDataRegion( + result = clbuffInstance.getOutBufferA()->WriteDataRegion( cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(0, clbuffInstance.readBufferA, - sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments( + 0, clbuffInstance.getInBufferA(), sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(1, clbuffInstance.readBufferB, - sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments( + 1, clbuffInstance.getInBufferB(), sizeof(cl_mem)); if (!result) { break; } result = kernel_sgemv_ptr->SetKernelArguments( - 2, clbuffInstance.writeBufferA, sizeof(cl_mem)); + 2, clbuffInstance.getOutBufferA(), sizeof(cl_mem)); if (!result) { break; } @@ -96,7 +96,7 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, break; } - result = clbuffInstance.writeBufferA->ReadDataRegion( + result = clbuffInstance.getOutBufferA()->ReadDataRegion( cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break;