Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vpl gpu accel impl #8

Open
wants to merge 23 commits into
base: vpl_source_final_perf
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
5b29e82
Init for DX11 accel policy
sivanov-work Aug 20, 2021
bec29e1
Add static callback into Dx11 Accel
sivanov-work Aug 23, 2021
dd0104a
Add alloc Texture
sivanov-work Aug 24, 2021
3da1023
Dx11 create surface
sivanov-work Aug 25, 2021
3fa4537
Integrate DX11 accel into Priv & Engine
sivanov-work Aug 26, 2021
d4ad981
Add staging texture & fix pipeline
sivanov-work Aug 27, 2021
46a6d74
Add Dx11 streaming perf test
sivanov-work Aug 30, 2021
52060be
Add draft version shared DX11 MediaFrame accessors
sivanov-work Sep 1, 2021
f0ad595
Rename lock-free counters
sivanov-work Sep 1, 2021
1ce83c3
Add comment for lock/unlock
sivanov-work Sep 2, 2021
c1d9450
Add comment for on_unlock
sivanov-work Sep 2, 2021
bec0626
Add Write DX11 accessor
sivanov-work Sep 2, 2021
b97e3a0
Move out code into elastic_barrier
sivanov-work Sep 3, 2021
e49992f
Add UT for shared_lock
sivanov-work Oct 11, 2021
51b0575
Remove deprecated CPU_ACCEL define in DX11 accel
sivanov-work Oct 11, 2021
bfcf2b0
Add UT for elastic_barrier
sivanov-work Oct 12, 2021
7b9624d
Split out DX11 alloc resources from accel file
sivanov-work Oct 13, 2021
8840307
Remove temporary changes in g* files
sivanov-work Oct 13, 2021
bd518a8
Fix Surface counter creation
sivanov-work Oct 13, 2021
f389c3b
Apply some style fix
sivanov-work Oct 14, 2021
d09b255
Add CComPtr in DX11 resource, encapsulate ctx & alloc in DX record
sivanov-work Oct 14, 2021
9bc258d
Move out lock access into DXItem
sivanov-work Oct 14, 2021
37d8ef6
Hide allocator in Dx11 resource Item and improve Lockable concept
sivanov-work Oct 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions modules/gapi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,13 @@ if(HAVE_GAPI_ONEVPL)
src/streaming/onevpl/onevpl_cfg_params.cpp
src/streaming/onevpl/onevpl_data_provider_interface_exception.cpp
src/streaming/onevpl/accelerators/surface/cpu_frame_adapter.cpp
src/streaming/onevpl/accelerators/surface/dx11_frame_adapter.cpp
src/streaming/onevpl/accelerators/surface/surface.cpp
src/streaming/onevpl/accelerators/surface/surface_pool.cpp
src/streaming/onevpl/accelerators/utils/shared_lock.cpp
src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
src/streaming/onevpl/accelerators/accel_policy_dx11.cpp
src/streaming/onevpl/accelerators/dx11_alloc_resource.cpp

src/streaming/onevpl/engine/engine_session.cpp
src/streaming/onevpl/engine/processing_engine_base.cpp
Expand Down
13 changes: 10 additions & 3 deletions modules/gapi/perf/streaming/gapi_streaming_source_perf_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ const std::string codec[] = {

using source_t = std::string;
using codec_t = std::string;
using source_description_t = std::tuple<source_t, codec_t>;
using accel_mode_t = std::string;
using source_description_t = std::tuple<source_t, codec_t, accel_mode_t>;

class OneVPLSourcePerfTest : public TestPerfParams<source_description_t> {};
class VideoCapSourcePerfTest : public TestPerfParams<source_t> {};
Expand All @@ -38,12 +39,16 @@ PERF_TEST_P_(OneVPLSourcePerfTest, TestPerformance)
const auto params = GetParam();
source_t src = findDataFile(get<0>(params));
codec_t type = get<1>(params);
accel_mode_t mode = get<2>(params);

std::vector<oneVPL_cfg_param> cfg_params {
oneVPL_cfg_param::create<std::string>("mfxImplDescription.Impl", "MFX_IMPL_TYPE_HARDWARE"),
oneVPL_cfg_param::create("mfxImplDescription.mfxDecoderDescription.decoder.CodecID", type),
};

if (!mode.empty()) {
cfg_params.push_back(oneVPL_cfg_param::create("mfxImplDescription.AccelerationMode", mode));
}
auto source_ptr = make_vpl_src(src, cfg_params);
Data out;
TEST_CYCLE()
Expand All @@ -70,8 +75,10 @@ PERF_TEST_P_(VideoCapSourcePerfTest, TestPerformance)
}

INSTANTIATE_TEST_CASE_P(Streaming, OneVPLSourcePerfTest,
Values(source_description_t(files[0], codec[0]),
source_description_t(files[1], codec[1])));
Values(source_description_t(files[0], codec[0], ""),
source_description_t(files[0], codec[0], "MFX_ACCEL_MODE_VIA_D3D11"),
source_description_t(files[1], codec[1], ""),
source_description_t(files[1], codec[1], "MFX_ACCEL_MODE_VIA_D3D11")));

INSTANTIATE_TEST_CASE_P(Streaming, VideoCapSourcePerfTest,
Values(files[0],
Expand Down
38 changes: 22 additions & 16 deletions modules/gapi/src/backends/render/grenderocv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

#include <opencv2/gapi/cpu/gcpukernel.hpp>
#include <opencv2/gapi/fluid/core.hpp>
#include "logger.hpp"

struct RenderOCVState
{
Expand Down Expand Up @@ -128,15 +129,13 @@ GAPI_OCV_KERNEL_ST(RenderFrameOCVImpl, cv::gapi::wip::draw::GRenderFrame, Render
out = in;

auto desc = out.desc();
auto w_out = out.access(cv::MediaFrame::Access::W);

auto out_y = cv::Mat(desc.size, CV_8UC1, w_out.ptr[0], w_out.stride[0]);
auto out_uv = cv::Mat(desc.size / 2, CV_8UC2, w_out.ptr[1], w_out.stride[1]);

auto r_in = in.access(cv::MediaFrame::Access::R);
cv::Mat upsample_uv, yuv;
{
auto r_in = in.access(cv::MediaFrame::Access::R);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How did these changes get here?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the safest R-W accessor implementation.
According to the well-known R-W concept, there should be no simultaneous R & W access.


auto in_y = cv::Mat(desc.size, CV_8UC1, r_in.ptr[0], r_in.stride[0]);
auto in_uv = cv::Mat(desc.size / 2, CV_8UC2, r_in.ptr[1], r_in.stride[1]);
auto in_y = cv::Mat(desc.size, CV_8UC1, r_in.ptr[0], r_in.stride[0]);
auto in_uv = cv::Mat(desc.size / 2, CV_8UC2, r_in.ptr[1], r_in.stride[1]);

/* FIXME How to render correctly on NV12 format ?
*
Expand All @@ -157,19 +156,26 @@ GAPI_OCV_KERNEL_ST(RenderFrameOCVImpl, cv::gapi::wip::draw::GRenderFrame, Render
*
*/

// NV12 -> YUV
cv::Mat upsample_uv, yuv;
cv::resize(in_uv, upsample_uv, in_uv.size() * 2, cv::INTER_LINEAR);
cv::merge(std::vector<cv::Mat>{in_y, upsample_uv}, yuv);
// NV12 -> YUV
cv::resize(in_uv, upsample_uv, in_uv.size() * 2, cv::INTER_LINEAR);
cv::merge(std::vector<cv::Mat>{in_y, upsample_uv}, yuv);
}

cv::gapi::wip::draw::drawPrimitivesOCVYUV(yuv, prims, state.ftpr);

// YUV -> NV12
cv::Mat out_u, out_v, uv_plane;
std::vector<cv::Mat> chs = { out_y, out_u, out_v };
cv::split(yuv, chs);
cv::merge(std::vector<cv::Mat>{chs[1], chs[2]}, uv_plane);
cv::resize(uv_plane, out_uv, uv_plane.size() / 2, cv::INTER_LINEAR);
{
auto w_out = out.access(cv::MediaFrame::Access::W);

auto out_y = cv::Mat(desc.size, CV_8UC1, w_out.ptr[0], w_out.stride[0]);
auto out_uv = cv::Mat(desc.size / 2, CV_8UC2, w_out.ptr[1], w_out.stride[1]);

cv::Mat out_u, out_v, uv_plane;
std::vector<cv::Mat> chs = { out_y, out_u, out_v };
cv::split(yuv, chs);
cv::merge(std::vector<cv::Mat>{chs[1], chs[2]}, uv_plane);
cv::resize(uv_plane, out_uv, uv_plane.size() / 2, cv::INTER_LINEAR);
}
}

static void setup(const cv::GFrameDesc& /* in_nv12 */,
Expand Down
128 changes: 128 additions & 0 deletions modules/gapi/src/streaming/onevpl/accelerators/accel_policy_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,98 @@
namespace cv {
namespace gapi {
namespace wip {
namespace utils {
/* UTILS */
/**
 * Computes the number of bytes a single surface of the given pixel format
 * and dimensions occupies.
 *
 * @param FourCC pixel format code (one of the MFX_FOURCC_* values handled below)
 * @param width  surface width used for the size computation
 * @param height surface height used for the size computation
 * @return size in bytes, or 0 when the format is not one of the handled ones
 */
mfxU32 GetSurfaceSize_(mfxU32 FourCC, mfxU32 width, mfxU32 height) {
    // Full-resolution plane plus two planes subsampled by 2 in each dimension.
    const mfxU32 full_plane = width * height;
    const mfxU32 subsampled_planes = 2 * (width / 2) * (height / 2);

    switch (FourCC) {
        case MFX_FOURCC_I420:
        case MFX_FOURCC_NV12:
            return full_plane + subsampled_planes;
        case MFX_FOURCC_I010:
        case MFX_FOURCC_P010:
            // Same layout as above, but twice as many bytes.
            return (full_plane + subsampled_planes) * 2;
        case MFX_FOURCC_RGB4:
            // Four bytes per pixel.
            return full_plane * 4;
        default:
            // Unknown format: the caller treats 0 as "cannot determine size".
            return 0;
    }
}

/**
 * Builds a surface whose B/G/R/A channel pointers reference a slice of an
 * already allocated buffer, starting at the given offset.
 *
 * The channel pointers are set to consecutive bytes (B at +0, G at +1,
 * R at +2, A at +3) and the pitch is set to 4 * Width.
 *
 * @param frameInfo          frame description copied into the surface
 * @param out_buf_ptr        owner of the backing buffer; forwarded to
 *                           Surface::create_surface together with the handle
 * @param out_buf_ptr_offset byte offset inside the buffer where this surface starts
 * @param out_buf_size       total size of the backing buffer in bytes
 * @return the created surface
 *
 * Triggers GAPI_Assert when the surface (pitch * height bytes starting at the
 * offset) does not fit entirely into the buffer.
 */
surface_ptr_t create_surface_RGB4_(mfxFrameInfo frameInfo,
                                   std::shared_ptr<void> out_buf_ptr,
                                   size_t out_buf_ptr_offset,
                                   size_t out_buf_size)
{
    mfxU8* buf = reinterpret_cast<mfxU8*>(out_buf_ptr.get());
    // NOTE(review): the pitch is stored as mfxU16, so Width * 4 is truncated for
    // very wide frames (presumably Width > 16383) - confirm this is acceptable.
    mfxU16 surfW = frameInfo.Width * 4;
    mfxU16 surfH = frameInfo.Height;

    // Evaluate the surface extent in size_t: the plain mfxU16 product would be
    // computed in promoted int arithmetic.
    const size_t required_bytes = static_cast<size_t>(surfW) * surfH;

    // The whole surface, not just its first byte, must lie inside the buffer.
    // Written as a subtraction so that offset + required_bytes cannot wrap around.
    if (out_buf_ptr_offset > out_buf_size ||
        out_buf_size - out_buf_ptr_offset < required_bytes) {
        GAPI_LOG_WARNING(nullptr, "Not enough buffer, ptr: " << out_buf_ptr <<
                                  ", size: " << out_buf_size <<
                                  ", offset: " << out_buf_ptr_offset <<
                                  ", W: " << surfW <<
                                  ", H: " << surfH);
        GAPI_Assert(false && "Invalid offset");
    }

    std::unique_ptr<mfxFrameSurface1> handle(new mfxFrameSurface1);
    memset(handle.get(), 0, sizeof(mfxFrameSurface1));

    handle->Info = frameInfo;
    // Interleaved channels: one byte each, in B, G, R, A order.
    handle->Data.B = buf + out_buf_ptr_offset;
    handle->Data.G = handle->Data.B + 1;
    handle->Data.R = handle->Data.B + 2;
    handle->Data.A = handle->Data.B + 3;
    handle->Data.Pitch = surfW;

    return Surface::create_surface(std::move(handle), out_buf_ptr);
}

/**
 * Builds a surface whose Y/U/V plane pointers reference a slice of an already
 * allocated buffer, starting at the given offset.
 *
 * Layout written into the surface: the Y plane occupies pitch * height bytes,
 * the U pointer follows it, and the V pointer is placed (pitch / 2) * (height / 2)
 * bytes after U. The pitch is Width, doubled for MFX_FOURCC_P010.
 *
 * @param frameInfo          frame description copied into the surface
 * @param out_buf_ptr        owner of the backing buffer; forwarded to
 *                           Surface::create_surface together with the handle
 * @param out_buf_ptr_offset byte offset inside the buffer where this surface starts
 * @param out_buf_size       total size of the backing buffer in bytes
 * @return the created surface
 *
 * Triggers GAPI_Assert when the Y plane plus both chroma planes do not fit
 * entirely into the buffer starting at the offset.
 */
surface_ptr_t create_surface_other_(mfxFrameInfo frameInfo,
                                    std::shared_ptr<void> out_buf_ptr,
                                    size_t out_buf_ptr_offset,
                                    size_t out_buf_size)
{
    mfxU8* buf = reinterpret_cast<mfxU8*>(out_buf_ptr.get());
    mfxU16 surfH = frameInfo.Height;
    // NOTE(review): only P010 gets a doubled pitch here; I010 is sized with
    // doubled bytes by GetSurfaceSize_ but keeps a single-byte pitch - confirm.
    mfxU16 surfW = (frameInfo.FourCC == MFX_FOURCC_P010) ? frameInfo.Width * 2 : frameInfo.Width;

    // Plane sizes in size_t: the plain mfxU16 products would be evaluated in
    // promoted int arithmetic.
    const size_t luma_bytes = static_cast<size_t>(surfW) * surfH;
    const size_t chroma_plane_bytes = static_cast<size_t>(surfW / 2) * (surfH / 2);
    // The V pointer addresses a second chroma plane, so both must be counted.
    const size_t required_bytes = luma_bytes + 2 * chroma_plane_bytes;

    // Written as a subtraction so that offset + required_bytes cannot wrap around.
    if (out_buf_ptr_offset > out_buf_size ||
        out_buf_size - out_buf_ptr_offset < required_bytes) {
        GAPI_LOG_WARNING(nullptr, "Not enough buffer, ptr: " << out_buf_ptr <<
                                  ", size: " << out_buf_size <<
                                  ", offset: " << out_buf_ptr_offset <<
                                  ", W: " << surfW <<
                                  ", H: " << surfH);
        GAPI_Assert(false && "Invalid offset");
    }

    std::unique_ptr<mfxFrameSurface1> handle(new mfxFrameSurface1);
    memset(handle.get(), 0, sizeof(mfxFrameSurface1));

    handle->Info = frameInfo;
    handle->Data.Y = buf + out_buf_ptr_offset;
    handle->Data.U = handle->Data.Y + luma_bytes;
    handle->Data.V = handle->Data.U + chroma_plane_bytes;
    handle->Data.Pitch = surfW;

    return Surface::create_surface(std::move(handle), out_buf_ptr);
}
} // namespace utils

VPLCPUAccelerationPolicy::VPLCPUAccelerationPolicy() {
GAPI_LOG_INFO(nullptr, "created");
Expand All @@ -35,6 +127,10 @@ VPLCPUAccelerationPolicy::~VPLCPUAccelerationPolicy() {
GAPI_LOG_INFO(nullptr, "destroyed");
}

// Reports the acceleration type of this policy: always AccelType::CPU.
VPLAccelerationPolicy::AccelType VPLCPUAccelerationPolicy::get_accel_type() const {
    const AccelType policy_type = AccelType::CPU;
    return policy_type;
}

void VPLCPUAccelerationPolicy::init(session_t session) {
(void)session;
//MFXVideoCORE_SetFrameAllocator(session, mfxFrameAllocator instance)
Expand Down Expand Up @@ -120,6 +216,38 @@ VPLCPUAccelerationPolicy::create_surface_pool(size_t pool_size, size_t surface_s

return preallocated_pool_memory_ptr;
}
/**
 * Creates a surface pool sized from a frame allocation request.
 *
 * The per-surface byte size is derived from the FourCC, width and height in
 * param.mfx.FrameInfo. The pool holds
 * alloc_request.NumFrameSuggested * 30 surfaces and is created through the
 * (pool_size, surface_size_bytes, creator) overload.
 *
 * @param alloc_request allocation request; NumFrameSuggested and Type are read
 * @param param         video parameters; only param.mfx.FrameInfo is read here
 * @return key of the created pool, as returned by the size-based overload
 * @throws std::runtime_error when the surface size cannot be determined for
 *         the given FourCC (size computed as 0)
 */
VPLCPUAccelerationPolicy::pool_key_t
VPLCPUAccelerationPolicy::create_surface_pool(const mfxFrameAllocRequest& alloc_request, mfxVideoParam& param) {

    // External (application) allocation of decode surfaces
    GAPI_LOG_DEBUG(nullptr, "Query mfxFrameAllocRequest.NumFrameSuggested: " << alloc_request.NumFrameSuggested <<
                            ", mfxFrameAllocRequest.Type: " << alloc_request.Type);

    mfxU32 singleSurfaceSize = utils::GetSurfaceSize_(param.mfx.FrameInfo.FourCC,
                                                      param.mfx.FrameInfo.Width,
                                                      param.mfx.FrameInfo.Height);
    if (!singleSurfaceSize) {
        throw std::runtime_error("Cannot determine surface size for: fourCC: " +
                                 std::to_string(param.mfx.FrameInfo.FourCC) +
                                 ", width: " + std::to_string(param.mfx.FrameInfo.Width) +
                                 ", height: " + std::to_string(param.mfx.FrameInfo.Height));
    }

    // Copy the frame description into the creator instead of referencing the
    // caller's `param`: the creator stays valid even if the callee keeps it
    // beyond this call.
    const auto frameInfo = param.mfx.FrameInfo;
    auto surface_creator =
            [frameInfo] (std::shared_ptr<void> out_buf_ptr, size_t out_buf_ptr_offset,
                         size_t out_buf_size) -> surface_ptr_t {
                return (frameInfo.FourCC == MFX_FOURCC_RGB4) ?
                        utils::create_surface_RGB4_(frameInfo, out_buf_ptr, out_buf_ptr_offset,
                                                    out_buf_size) :
                        utils::create_surface_other_(frameInfo, out_buf_ptr, out_buf_ptr_offset,
                                                     out_buf_size);};

    //TODO Configure preallocation size (how many frames we can hold)
    const size_t preallocated_frames_count = 30;
    return create_surface_pool(alloc_request.NumFrameSuggested * preallocated_frames_count,
                               singleSurfaceSize, surface_creator);
}

VPLCPUAccelerationPolicy::surface_weak_ptr_t VPLCPUAccelerationPolicy::get_free_surface(pool_key_t key) {
auto pool_it = pool_table.find(key);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@ struct VPLCPUAccelerationPolicy final : public VPLAccelerationPolicy

using pool_t = CachedPool;

GAPI_EXPORTS AccelType get_accel_type() const override;
GAPI_EXPORTS void init(session_t session) override;
GAPI_EXPORTS void deinit(session_t session) override;
GAPI_EXPORTS pool_key_t create_surface_pool(size_t pool_size, size_t surface_size_bytes, surface_ptr_ctr_t creator) override;
GAPI_EXPORTS pool_key_t create_surface_pool(size_t pool_size, size_t surface_size_bytes, surface_ptr_ctr_t creator);
GAPI_EXPORTS pool_key_t create_surface_pool(const mfxFrameAllocRequest& alloc_request, mfxVideoParam& param) override;
GAPI_EXPORTS surface_weak_ptr_t get_free_surface(pool_key_t key) override;
GAPI_EXPORTS size_t get_free_surface_count(pool_key_t key) const override;
GAPI_EXPORTS size_t get_surface_count(pool_key_t key) const override;
Expand Down
Loading