diff --git a/nntrainer/cl_buffer_manager.cpp b/nntrainer/cl_buffer_manager.cpp new file mode 100644 index 0000000000..c2af78ca54 --- /dev/null +++ b/nntrainer/cl_buffer_manager.cpp @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file cl_buffer_manager.cpp + * @date 01 Dec 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * @brief This file contains global Buffer objects and manages them + */ + +#include + +namespace nntrainer { + +ClBufferManager &ClBufferManager::getInstance() { + static ClBufferManager instance; + return instance; +} + +// Creates the shared buffers (read*: read_only = true, write*: read_only = false). +// to-do: use an array of Buffer objects if required; add fp16 Buffer objects +ClBufferManager::ClBufferManager() { + readBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + readBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + readBufferC = new opencl::Buffer(context_inst_, buffer_size_bytes, true); + writeBufferA = new opencl::Buffer(context_inst_, buffer_size_bytes, false); + writeBufferB = new opencl::Buffer(context_inst_, buffer_size_bytes, false); + ml_logi("ClBufferManager: Buffers initialized"); +} + +ClBufferManager::~ClBufferManager() { + delete readBufferA; + delete readBufferB; + delete readBufferC; + delete writeBufferA; + delete writeBufferB; +} + +} // namespace nntrainer diff --git a/nntrainer/cl_buffer_manager.h b/nntrainer/cl_buffer_manager.h new file mode 100644 index 0000000000..875e7e9bc0 --- /dev/null +++ b/nntrainer/cl_buffer_manager.h @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file cl_buffer_manager.h + * @date 01 Dec 2024 + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * @brief This file contains global Buffer objects and manages them + 
*/ + +#ifndef __CL_BUFFER_MANAGER_H__ +#define __CL_BUFFER_MANAGER_H__ + +#include + +#include +#include + +#include + +namespace nntrainer { + +/** + * @class ClBufferManager contains Buffer object management + * @brief Support for Buffer management + */ + +class ClBufferManager { + +private: + /** + * @brief Private constructor to prevent object creation + * + */ + ClBufferManager(); + + /** + * @brief OpenCl context global instance + * + */ + opencl::ContextManager &context_inst_ = opencl::ContextManager::GetInstance(); + + /** + * @brief Buffer size in bytes preset (256 mebibytes) + */ + size_t buffer_size_bytes = 8192 * 8192 * sizeof(float); + +public: + /** + * @brief Get Global ClBufferManager. + * + * @return ClBufferManager& + */ + static ClBufferManager &getInstance(); + + /** + * @brief Copying is disabled: the instance owns the Buffer pointers + */ + ClBufferManager(const ClBufferManager &) = delete; + ClBufferManager &operator=(const ClBufferManager &) = delete; + + /** + * @brief Shared Buffer objects: read* are created with read_only = true, + * write* with read_only = false + */ + opencl::Buffer *readBufferA; + opencl::Buffer *readBufferB; + opencl::Buffer *readBufferC; + opencl::Buffer *writeBufferA; + opencl::Buffer *writeBufferB; + + /** + * @brief Destroy Buffer pointers. + * + */ + ~ClBufferManager(); +}; +} // namespace nntrainer + +#endif /* __CL_BUFFER_MANAGER_H__ */ diff --git a/nntrainer/meson.build b/nntrainer/meson.build index 2fa705a756..ed15b8f2a7 100644 --- a/nntrainer/meson.build +++ b/nntrainer/meson.build @@ -64,7 +64,9 @@ nntrainer_common_sources = [ if get_option('enable-opencl') nntrainer_headers += meson.current_source_dir() / 'cl_context.h' + nntrainer_headers += meson.current_source_dir() / 'cl_buffer_manager.h' nntrainer_common_sources += 'cl_context.cpp' + nntrainer_common_sources += 'cl_buffer_manager.cpp' endif foreach s : nntrainer_common_sources diff --git a/nntrainer/opencl/opencl_buffer.cpp b/nntrainer/opencl/opencl_buffer.cpp index 1614f3e622..07ae213a94 100644 --- a/nntrainer/opencl/opencl_buffer.cpp +++ b/nntrainer/opencl/opencl_buffer.cpp @@ -27,7 +27,7 @@ namespace nntrainer::opencl { * @param read_only flag * @param data data for the buffer */ -Buffer::Buffer(ContextManager &context_manager, int size_in_bytes, 
+Buffer::Buffer(ContextManager &context_manager, size_t size_in_bytes, bool read_only, void *data) { cl_context context = context_manager.GetContext(); cl_mem_flags flags = read_only ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE; @@ -94,6 +94,22 @@ bool Buffer::WriteData(CommandQueueManager &command_queue_inst, return command_queue_inst.EnqueueWriteBuffer(mem_buf_, size_, data); } +bool Buffer::WriteDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, const void *data, + size_t host_origin_offset, + size_t buffer_origin_offset) { + // region must lie inside the buffer; subtract to avoid size_t overflow + if (buffer_origin_offset > size_ || + size_in_bytes > size_ - buffer_origin_offset) { + ml_loge("Failed to write buffer region. Region size(%zu bytes) at " + "offset(%zu bytes) exceeds buffer size(%zu bytes).", + size_in_bytes, buffer_origin_offset, size_); + return false; + } + return command_queue_inst.EnqueueWriteBufferRegion( + mem_buf_, size_in_bytes, data, host_origin_offset, buffer_origin_offset); +} + /** * @brief reading data from the buffer * @@ -105,6 +121,22 @@ bool Buffer::ReadData(CommandQueueManager &command_queue_inst, void *data) { return command_queue_inst.EnqueueReadBuffer(mem_buf_, size_, data); } +bool Buffer::ReadDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, void *data, + size_t host_origin_offset, + size_t buffer_origin_offset) { + // region must lie inside the buffer; subtract to avoid size_t overflow + if (buffer_origin_offset > size_ || + size_in_bytes > size_ - buffer_origin_offset) { + ml_loge("Failed to read from buffer region. 
Region size(%zu bytes) at " + "offset(%zu bytes) exceeds buffer size(%zu bytes).", + size_in_bytes, buffer_origin_offset, size_); + return false; + } + return command_queue_inst.EnqueueReadBufferRegion( + mem_buf_, size_in_bytes, data, host_origin_offset, buffer_origin_offset); +} + void *Buffer::MapBuffer(CommandQueueManager &command_queue_inst, size_t offset_in_bytes, size_t size_in_bytes, bool read_only, bool async) { diff --git a/nntrainer/opencl/opencl_buffer.h b/nntrainer/opencl/opencl_buffer.h index 031efe5cbf..beddb30339 100644 --- a/nntrainer/opencl/opencl_buffer.h +++ b/nntrainer/opencl/opencl_buffer.h @@ -54,8 +54,8 @@ class Buffer { * @param read_only flag * @param data data for the buffer */ - Buffer(ContextManager &context_manager, int size_in_bytes, bool read_only, - void *data); + Buffer(ContextManager &context_manager, size_t size_in_bytes, bool read_only, + void *data = nullptr); /** * @brief Move constructor for buffer by deleting the previous buffer @@ -106,6 +106,21 @@ class Buffer { */ bool WriteData(CommandQueueManager &command_queue_inst, const void *data); + /** + * @brief writing data to a buffer region + * + * @param command_queue_inst reference of command queue instance + * @param size_in_bytes size of region + * @param data pointer of region + * @param host_origin_offset offset in the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @return true if successful write or false otherwise + */ + bool WriteDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, const void *data, + size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0); + /** * @brief reading data from the buffer * @@ -115,6 +130,21 @@ class Buffer { */ bool ReadData(CommandQueueManager &command_queue_inst, void *data); + /** + * @brief Reading data from a buffer region + * + * @param command_queue_inst reference of command queue instance + * @param size_in_bytes size of region + * @param data pointer of region + * @param host_origin_offset 
offset in the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @return true if successful read or false otherwise + */ + bool ReadDataRegion(CommandQueueManager &command_queue_inst, + size_t size_in_bytes, void *data, + size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0); + /** * @brief Mapping buffer to host memory * diff --git a/nntrainer/opencl/opencl_command_queue_manager.cpp b/nntrainer/opencl/opencl_command_queue_manager.cpp index e1bffa9f28..78b32537c1 100644 --- a/nntrainer/opencl/opencl_command_queue_manager.cpp +++ b/nntrainer/opencl/opencl_command_queue_manager.cpp @@ -131,6 +131,41 @@ bool CommandQueueManager::EnqueueReadBuffer(cl_mem buffer, size_t size_in_bytes, return true; } +bool CommandQueueManager::EnqueueReadBufferRegion( + cl_mem buffer, size_t size_in_bytes, void *data, size_t host_origin_offset, + size_t buffer_origin_offset, bool async) { + + // managing synchronization + const cl_bool blocking = async ? CL_FALSE : CL_TRUE; + + // (x, y, z) offset in the memory region associated with buffer + const size_t buffer_origin[] = {buffer_origin_offset, 0, 0}; + // (x, y, z) offset in the memory region associated with host + const size_t host_origin[] = {host_origin_offset, 0, 0}; + // region defines the (width in bytes, height in rows, depth in slices) + const size_t region[] = {size_in_bytes, 1, 1}; + // length of each row in bytes + size_t row_pitch = region[0]; + // length of each 2D slice in bytes + size_t slice_pitch = region[0] * region[1]; + + // Buffer and host data are interpreted as 1D in this case + // hence row and slice pitch are same for both + cl_int error_code = clEnqueueReadBufferRect( + command_queue_, buffer, blocking, buffer_origin, host_origin, region, + row_pitch, slice_pitch, row_pitch, slice_pitch, data, 0, nullptr, nullptr); + + if (error_code != CL_SUCCESS) { + ml_loge("Failed to read data region from GPU (clEnqueueReadBufferRect). 
" + "OpenCL error " + "code: %d", + error_code); + return false; + } + + return true; +} + /** * @brief Writing buffer object. Used from Buffer class * @@ -150,6 +185,7 @@ bool CommandQueueManager::EnqueueWriteBuffer(cl_mem buffer, auto error_code = clEnqueueWriteBuffer(command_queue_, buffer, blocking, 0, size_in_bytes, data, 0, nullptr, nullptr); + if (error_code != CL_SUCCESS) { ml_loge("Failed to upload data to GPU (clEnqueueWriteBuffer). OpenCL error " "code: %d", @@ -160,6 +196,41 @@ bool CommandQueueManager::EnqueueWriteBuffer(cl_mem buffer, return true; } +bool CommandQueueManager::EnqueueWriteBufferRegion( + cl_mem buffer, size_t size_in_bytes, const void *data, + size_t host_origin_offset, size_t buffer_origin_offset, bool async) { + + // managing synchronization + const cl_bool blocking = async ? CL_FALSE : CL_TRUE; + + // (x, y, z) offset in the memory region associated with buffer + const size_t buffer_origin[] = {buffer_origin_offset, 0, 0}; + // (x, y, z) offset in the memory region associated with host + const size_t host_origin[] = {host_origin_offset, 0, 0}; + // region defines the (width in bytes, height in rows, depth in slices) + const size_t region[] = {size_in_bytes, 1, 1}; + // length of each row in bytes + size_t row_pitch = region[0]; + // length of each 2D slice in bytes + size_t slice_pitch = region[0] * region[1]; + + // Buffer and host data are interpreted as 1D in this case + // hence row and slice pitch are same for both + cl_int error_code = clEnqueueWriteBufferRect( + command_queue_, buffer, blocking, buffer_origin, host_origin, region, + row_pitch, slice_pitch, row_pitch, slice_pitch, data, 0, nullptr, nullptr); + + if (error_code != CL_SUCCESS) { + ml_loge("Failed to write data region to GPU (clEnqueueWriteBufferRect). 
" + "OpenCL error " + "code: %d", + error_code); + return false; + } + + return true; +} + /** * @brief Mapping a region of a buffer object into the host address space * diff --git a/nntrainer/opencl/opencl_command_queue_manager.h b/nntrainer/opencl/opencl_command_queue_manager.h index 8f9965e654..7047f4a2c6 100644 --- a/nntrainer/opencl/opencl_command_queue_manager.h +++ b/nntrainer/opencl/opencl_command_queue_manager.h @@ -73,6 +73,22 @@ class CommandQueueManager { bool EnqueueReadBuffer(cl_mem buffer, size_t size_in_bytes, void *data, bool async = false); + /** + * @brief Reading 1D region from a buffer object. Used from Buffer class + * + * @param buffer cl_mem buffer object + * @param size_in_bytes size of data region + * @param data pointer for the region + * @param host_origin_offset offset in the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @param async flag for asynchronous operation + * @return true if reading is successful or false otherwise + */ + bool EnqueueReadBufferRegion(cl_mem buffer, size_t size_in_bytes, void *data, + size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0, + bool async = false); + /** * @brief Writing buffer object. Used from Buffer class * @@ -85,6 +101,21 @@ class CommandQueueManager { bool EnqueueWriteBuffer(cl_mem buffer, size_t size_in_bytes, const void *data, bool async = false); + /** + * @brief Writing 1D region of a buffer object. 
Used from Buffer class + * + * @param buffer cl_mem buffer object + * @param size_in_bytes size of data region + * @param data pointer for the region + * @param host_origin_offset offset in the host memory region + * @param buffer_origin_offset offset in the buffer memory region + * @param async flag for asynchronous operation + * @return true if writing is successful or false otherwise + */ + bool EnqueueWriteBufferRegion(cl_mem buffer, size_t size_in_bytes, + const void *data, size_t host_origin_offset = 0, + size_t buffer_origin_offset = 0, + bool async = false); /** * @brief Mapping a region of a buffer object into the host address space * diff --git a/nntrainer/opencl/opencl_loader.cpp b/nntrainer/opencl/opencl_loader.cpp index e3cf3e73c2..8b39eace80 100644 --- a/nntrainer/opencl/opencl_loader.cpp +++ b/nntrainer/opencl/opencl_loader.cpp @@ -75,6 +75,8 @@ void LoadOpenCLFunctions(void *libopencl) { LoadFunction(clEnqueueReadBuffer); LoadFunction(clEnqueueMapBuffer); LoadFunction(clEnqueueUnmapMemObject); + LoadFunction(clEnqueueWriteBufferRect); + LoadFunction(clEnqueueReadBufferRect); LoadFunction(clCreateProgramWithSource); LoadFunction(clCreateProgramWithBinary); LoadFunction(clBuildProgram); @@ -102,6 +104,8 @@ PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer; PFN_clEnqueueReadBuffer clEnqueueReadBuffer; PFN_clEnqueueMapBuffer clEnqueueMapBuffer; PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; +PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; +PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect; PFN_clCreateProgramWithSource clCreateProgramWithSource; PFN_clCreateProgramWithBinary clCreateProgramWithBinary; PFN_clBuildProgram clBuildProgram; diff --git a/nntrainer/opencl/opencl_loader.h b/nntrainer/opencl/opencl_loader.h index 0aa2a5cfd6..20cdef58fb 100644 --- a/nntrainer/opencl/opencl_loader.h +++ b/nntrainer/opencl/opencl_loader.h @@ -87,6 +87,24 @@ typedef cl_int(CL_API_CALL *PFN_clEnqueueUnmapMemObject)( const cl_event * /**< event_wait_list */, cl_event * /**< event */ ); +typedef cl_int(CL_API_CALL 
*PFN_clEnqueueWriteBufferRect)( + cl_command_queue /**< command_queue */, cl_mem /**< buffer */, + cl_bool /**< blocking_write */, const size_t * /**< buffer_origin */, + const size_t * /**< host_origin */, const size_t * /**< region */, + size_t /**< buffer_row_pitch */, size_t /**< buffer_slice_pitch */, + size_t /**< host_row_pitch */, size_t /**< host_slice_pitch */, + const void * /**< ptr */, cl_uint /**< num_events_in_wait_list */, + const cl_event * /**< event_wait_list */, cl_event * /**< event */); + +typedef cl_int(CL_API_CALL *PFN_clEnqueueReadBufferRect)( + cl_command_queue /**< command_queue */, cl_mem /**< buffer */, + cl_bool /**< blocking_read */, const size_t * /**< buffer_origin */, + const size_t * /**< host_origin */, const size_t * /**< region */, + size_t /**< buffer_row_pitch */, size_t /**< buffer_slice_pitch */, + size_t /**< host_row_pitch */, size_t /**< host_slice_pitch */, + void * /**< ptr */, cl_uint /**< num_events_in_wait_list */, + const cl_event * /**< event_wait_list */, cl_event * /**< event */); + typedef cl_program(CL_API_CALL *PFN_clCreateProgramWithSource)( cl_context /**< context */, cl_uint /**< count */, const char ** /**< strings */, const size_t * /**< lengths */, @@ -161,6 +179,8 @@ extern PFN_clEnqueueWriteBuffer clEnqueueWriteBuffer; extern PFN_clEnqueueReadBuffer clEnqueueReadBuffer; extern PFN_clEnqueueMapBuffer clEnqueueMapBuffer; extern PFN_clEnqueueUnmapMemObject clEnqueueUnmapMemObject; +extern PFN_clEnqueueWriteBufferRect clEnqueueWriteBufferRect; +extern PFN_clEnqueueReadBufferRect clEnqueueReadBufferRect; extern PFN_clCreateProgramWithSource clCreateProgramWithSource; extern PFN_clCreateProgramWithBinary clCreateProgramWithBinary; extern PFN_clBuildProgram clBuildProgram; diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp index 558111b5a8..bd4d3f279c 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.cpp +++ 
b/nntrainer/tensor/cl_operations/blas_kernels.cpp @@ -39,41 +39,39 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, size_t dim1_size = sizeof(float) * dim1; size_t dim2_size = sizeof(float) * dim2; - opencl::Buffer inputA(cl_context_ref.context_inst_, - dim1 * dim2 * sizeof(float), true, nullptr); - opencl::Buffer inputX(cl_context_ref.context_inst_, dim2_size, true, - nullptr); - - opencl::Buffer inOutY(cl_context_ref.context_inst_, dim1_size, true, - nullptr); - - result = inputA.WriteData(cl_context_ref.command_queue_inst_, matAdata); + result = clbuffInstance.readBufferA->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1 * dim2 * sizeof(float), matAdata); if (!result) { break; } - result = inputX.WriteData(cl_context_ref.command_queue_inst_, vecXdata); + result = clbuffInstance.readBufferB->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim2_size, vecXdata); if (!result) { break; } - result = inOutY.WriteData(cl_context_ref.command_queue_inst_, vecYdata); + result = clbuffInstance.writeBufferA->WriteDataRegion( + cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(0, &inputA, sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments(0, clbuffInstance.readBufferA, + sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(1, &inputX, sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments(1, clbuffInstance.readBufferB, + sizeof(cl_mem)); if (!result) { break; } - result = kernel_sgemv_ptr->SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + result = kernel_sgemv_ptr->SetKernelArguments( + 2, clbuffInstance.writeBufferA, sizeof(cl_mem)); if (!result) { break; } @@ -97,7 +95,8 @@ void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, break; } - result = inOutY.ReadData(cl_context_ref.command_queue_inst_, vecYdata); + result = clbuffInstance.writeBufferA->ReadDataRegion( + 
cl_context_ref.command_queue_inst_, dim1_size, vecYdata); if (!result) { break; } diff --git a/nntrainer/tensor/cl_operations/blas_kernels.h b/nntrainer/tensor/cl_operations/blas_kernels.h index 7a62148888..2f16089465 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.h +++ b/nntrainer/tensor/cl_operations/blas_kernels.h @@ -14,15 +14,18 @@ #ifndef __BLAS_KERNELS_H__ #define __BLAS_KERNELS_H__ +#include #include #include #include + #include namespace nntrainer { // get global cl_context to use in kernels static ClContext cl_context_ref; +static ClBufferManager &clbuffInstance = ClBufferManager::getInstance(); /** * @brief sgemv computation : Y = A*X + Y