From 9f3282b348bd450ef3f3fbb64149392400e5460d Mon Sep 17 00:00:00 2001 From: bibek <108366729+bghimireamd@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:59:01 -0500 Subject: [PATCH 1/4] Enable NCHW/NHWC and NCDHW/NDHWC layout in batch norm driver command (#3234) --- driver/bn_driver.hpp | 1377 +++++++++-------- driver/conv_driver.hpp | 129 -- driver/dm_bnorm.cpp | 4 + driver/driver.hpp | 157 +- fin | 2 +- include/miopen/miopen.h | 173 +++ src/batch_norm.cpp | 16 +- src/batch_norm_api.cpp | 246 ++- src/driver_arguments.cpp | 26 +- src/fusion.cpp | 1 + src/include/miopen/batch_norm.hpp | 77 +- .../miopen/batchnorm/problem_description.hpp | 40 +- src/include/miopen/driver_arguments.hpp | 1 + .../miopen/fusion/problem_description.hpp | 15 +- src/ocl/batchnormocl.cpp | 97 +- src/solver/batchnorm/backward_ck.cpp | 3 +- .../batchnorm/backward_per_activation.cpp | 2 + .../batchnorm/backward_spatial_multiple.cpp | 4 + .../batchnorm/backward_spatial_single.cpp | 2 + test/bn_3d_peract_test.cpp | 15 + test/bn_3d_spatial_test.cpp | 15 + test/bn_peract_test.cpp | 15 + test/bn_spatial_test.cpp | 15 + test/fusionHost.hpp | 184 ++- test/gtest/bn.hpp | 225 ++- test/gtest/bn_bwd.cpp | 117 +- test/gtest/bn_fwd_train.cpp | 113 +- test/gtest/bn_infer.cpp | 126 +- test/gtest/bn_test_data.hpp | 20 +- 29 files changed, 2064 insertions(+), 1153 deletions(-) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index 4b94ac42d8..238b4ea1e6 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -33,12 +33,16 @@ #include "tensor_driver.hpp" #include "timer.hpp" #include "util_driver.hpp" +#include "rocrand_wrapper.hpp" #include "../test/verify.hpp" +#include "../test/random.hpp" +#include "../test/fusionHost.hpp" #include #include #include +#include "miopen/batch_norm.hpp" #include #include @@ -66,16 +70,7 @@ template class BatchNormDriver : public Driver { public: - BatchNormDriver() : Driver() - { - miopenCreateTensorDescriptor(&inputTensor); - miopenCreateTensorDescriptor(&outputTensor); - miopenCreateTensorDescriptor(&biasScaleTensor); - miopenCreateTensorDescriptor(&dxOutputTensor); - miopenCreateTensorDescriptor(&dyInputTensor); - - data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; - } + BatchNormDriver() : Driver() { data_type = (sizeof(Tgpu) == 4) ? miopenFloat : miopenHalf; } int AddCmdLineArgs() override; int ParseCmdLineArgs(int argc, char* argv[]) override; @@ -99,25 +94,25 @@ class BatchNormDriver : public Driver void runGPUFwdTrain(Tref epsilon, Tref eAF, float alpha, float beta); void runGPUBwd(Tref epsilon, float alpha, float beta); - void runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth = 0); - void runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth = 0); + void runCPUFwdInference(Tref epsilon); + void runCPUFwdTrain(Tref epsilon, Tref eAF); int VerifyBackward() override; int VerifyForward() override; - ~BatchNormDriver() override - { - miopenDestroyTensorDescriptor(outputTensor); - miopenDestroyTensorDescriptor(inputTensor); - miopenDestroyTensorDescriptor(biasScaleTensor); - miopenDestroyTensorDescriptor(dxOutputTensor); - miopenDestroyTensorDescriptor(dyInputTensor); - } + // Helper function to check the Layout type short names + bool ChkLayout_ShortName(); + // function to validate the Layout type parameters. + // layout parameter value to std (NCHW/NHWC/NCDHW/NDHWC) values, + // defined in MIOpen lib. 
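The layout plumbing declared here ultimately just changes how strides are derived from the same `{N, C, H, W}` lengths. A standalone sketch (not part of the patch) of the packed-stride convention that `miopenTensorLayout_t` values such as NCHW and NHWC imply:

```cpp
#include <array>
#include <cstddef>

// Packed strides for dims ordered {N, C, H, W}; the layout decides which
// dimension is innermost. MIOpen's TensorDescriptor derives the equivalent
// internally from a miopenTensorLayout_t.
std::array<std::size_t, 4> PackedStrides(std::size_t c, std::size_t h, std::size_t w, bool nhwc)
{
    if(!nhwc)
        return {c * h * w, h * w, w, 1}; // NCHW: W innermost
    return {h * w * c, 1, w * c, c};     // NHWC: C innermost
}
```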
+ void ValidateLayoutInputParameters(std::string layout_type); + + ~BatchNormDriver() override {} private: miopenBatchNormMode_t bn_mode; + miopenActivationMode_t activ_mode = miopenActivationRELU; + bool saveMeanVar; bool bsaveMeanVar; bool keepRunningMeanVar; @@ -126,63 +121,53 @@ class BatchNormDriver : public Driver int forw; int back; + bool isFwdInfer = false; + bool isFwdTrain = false; + bool isBwd = false; + InputFlags inflags; bool isDepthSpecified = false; - miopenTensorDescriptor_t inputTensor; - miopenTensorDescriptor_t biasScaleTensor; - miopenTensorDescriptor_t outputTensor; - - // Backwards - miopenTensorDescriptor_t dyInputTensor; - miopenTensorDescriptor_t dxOutputTensor; - - std::unique_ptr dyin_dev; // this is the output of fwd - std::unique_ptr in_dev; - std::unique_ptr out_dev; - std::unique_ptr scale_dev; - std::unique_ptr bias_dev; - - std::unique_ptr dxout_dev; - std::unique_ptr dscale_dev; - std::unique_ptr dbias_dev; - - std::unique_ptr runningMean_dev; - std::unique_ptr runningVariance_dev; - std::unique_ptr saveMean_dev; - std::unique_ptr saveInvVariance_dev; - - std::vector dyin; // output of forward - std::vector in; - std::vector out; - std::vector out_host; - std::vector dxout; - std::vector dxout_host; - - std::vector scale; - std::vector scale_host; - std::vector bias; - std::vector bias_host; - - std::vector dscale; - std::vector dscale_host; - std::vector dbias; - std::vector dbias_host; - - std::vector runningMean; - std::vector runningVariance; - std::vector runningMean_host; - std::vector runningVariance_host; - - std::vector saveMean; - std::vector saveInvVariance; - - std::vector saveMean_host; - std::vector saveInvVariance_host; - - int createSaveBuffers(); - int createRunningBuffers(); + GpumemTensor in; + GpumemTensor out; + tensor out_ref; + + // forward + GpumemTensor scale; + GpumemTensor bias; + + // forward inference + GpumemTensor estMean; + GpumemTensor estVariance; + + GpumemTensor savedMean; + tensor savedMean_ref; + + // forward training + GpumemTensor savedVariance; + GpumemTensor runMean; + GpumemTensor runVariance; + // ref + tensor savedVariance_ref; + tensor runMean_ref; + tensor runVariance_ref; + + // backward needed different type for bwd. + GpumemTensor out_bwd; + + GpumemTensor bnScale; + GpumemTensor dScale; + GpumemTensor dBias; + // savedMean declared above as Tmix as well + GpumemTensor savedInvVar; + GpumemTensor dy; + + tensor dBias_ref; + tensor dScale_ref; + Tref maxval; + + miopenTensorLayout_t bn_layout; }; template @@ -202,46 +187,84 @@ template int BatchNormDriver::GetandSetData() { + std::vector in_len = GetInputTensorLengthsFromCmdLine(); SetBNParametersFromCmdLineArgs(); - std::vector in_len = GetInputTensorLengthsFromCmdLine(); + auto gen_value = [](auto...) 
{ return prng::gen_descreet_uniform_sign(1e-2, 100); }; - std::vector sb_len; - if(bn_mode == miopenBNPerActivation) + in.AllocOnHost(tensor{bn_layout, in_len}); + in.InitHostData(in.GetTensor().desc.GetElementSize(), true, gen_value); + + auto derivedBnDesc = miopen::TensorDescriptor{}; + miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); + + if(isFwdInfer || isFwdTrain) { - // 1xCxHxW | in_len.size = 4 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(in_len[2]); - sb_len.push_back(in_len[3]); + out.AllocOnHost(tensor{bn_layout, in_len}); + scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - // 1xCxDxHxW | in_len.size = 5 - if(in_len.size() == 5) - { - sb_len.push_back(in_len[4]); - } + auto gen_value_scale_bias = [](auto...) { + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + + scale.InitHostData(scale.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); + bias.InitHostData(bias.GetTensor().desc.GetElementSize(), true, gen_value_scale_bias); } - else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - sb_len.push_back(1); - sb_len.push_back(in_len[1]); - sb_len.push_back(1); - sb_len.push_back(1); + if(isFwdInfer) + { + estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - // 1xCx1x1x1 - if(in_len.size() == 5) - { - sb_len.push_back(1); - } + auto gen_value_emean = [](auto...) { + return prng::gen_descreet_uniform_sign(1e-2, 100); + }; + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); + } + else if(isFwdTrain) + { + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + + auto gen_var = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + runMean.InitHostData(runMean.GetTensor().desc.GetElementSize(), true, gen_var); + runVariance.InitHostData(runVariance.GetTensor().desc.GetElementSize(), true, gen_var); } + else if(isBwd) + { + out_bwd.AllocOnHost(tensor{bn_layout, in_len}); + + bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dy.AllocOnHost(tensor{bn_layout, in_len}); + + auto gen_var_bwd = [](auto...) { + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); - SetTensorNd(inputTensor, in_len, data_type); - SetTensorNd(biasScaleTensor, sb_len, ((sizeof(Tmix) == 4) ? miopenFloat : miopenHalf)); - SetTensorNd(outputTensor, in_len, data_type); + dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); + savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - // backwards - SetTensorNd(dyInputTensor, in_len, data_type); - SetTensorNd(dxOutputTensor, in_len, data_type); + bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); + + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + + auto gen_in_var = [](auto...) 
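For context, the `miopen::DeriveBNTensorDescriptor` call above computes the same scale/bias/mean/variance shape that the hand-rolled `sb_len` logic removed in this hunk used to build. A standalone sketch of that rule (lengths are in NCHW/NCDHW dimension order, which MIOpen keeps regardless of memory layout):

```cpp
#include <cstddef>
#include <vector>

enum class BnMode { PerActivation, Spatial };

std::vector<std::size_t> DeriveBnShape(const std::vector<std::size_t>& in, BnMode mode)
{
    std::vector<std::size_t> sb(in.size(), 1); // start from 1x1x...x1
    sb[1] = in[1];                             // C is always kept
    if(mode == BnMode::PerActivation)          // keep the spatial dims too
        for(std::size_t i = 2; i < in.size(); ++i)
            sb[i] = in[i];
    return sb; // spatial: 1xCx1x1(x1); per-activation: 1xCxHxW or 1xCxDxHxW
}
```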
{ + return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); + }; + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); + } + else + { + std::cout << "\nUnknown batch norm state!\n"; + exit(EXIT_FAILURE); + } return miopenStatusSuccess; } @@ -265,6 +288,10 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("in_h", 'H', "32", "Input Height (Default=32)", "int"); inflags.AddInputFlag("in_w", 'W', "32", "Input Width (Default=32)", "int"); inflags.AddInputFlag("in_d", 'D', "0", "Input Depth (Default=0)", "int"); + + inflags.AddInputFlag( + "layout", 'L', "", "Layout (Default=NCHW for 2d conv, NCDHW for 3d conv)", "string", true); + inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int"); @@ -317,6 +344,39 @@ std::vector BatchNormDriver::GetInputTensorLengthsFromCmd } } +template +bool BatchNormDriver::ChkLayout_ShortName() +{ + // check for short name of layout type + if(inflags.FindShortName("layout") == 'L') + { + // do noting + // found valid short names + return true; + } + else + { + std::cerr << "Error:Invalid Short Name for layout!" << std::endl; + exit(EXIT_FAILURE); + } +} + +template +void BatchNormDriver::ValidateLayoutInputParameters(std::string layout_value) +{ + if(!ChkLayout_ShortName()) + { + std::cerr << "Invalid Layout Short Name = " << inflags.FindShortName("layout") << std::endl; + exit(EXIT_FAILURE); + } + if((layout_value.compare("NCHW") != 0) && (layout_value.compare("NHWC") != 0) && + (layout_value.compare("NCDHW") != 0) && (layout_value.compare("NDHWC") != 0)) + { + std::cerr << "Invalid Layout Parameter Value - " << layout_value << std::endl; + exit(EXIT_FAILURE); + } +} + template int BatchNormDriver::SetBNParametersFromCmdLineArgs() { @@ -324,6 +384,45 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() // double bnAlpha = inflags.GetValueDouble("alpha"); // double bnBeta = inflags.GetValueDouble("beta"); + const std::string default_layout = isDepthSpecified ? "NCDHW" : "NCHW"; + + // inflags value is empty, default value is used + // if it is supplied via cmd line, check the value. 
+ if(inflags.GetValueStr("layout").empty()) + { + inflags.SetValue("layout", default_layout); + } + else + { + std::string layoutValue = inflags.GetValueStr("layout"); + ValidateLayoutInputParameters(layoutValue); + inflags.SetValue("layout", layoutValue); + } + + std::string layout = inflags.GetValueStr("layout"); + + if(layout == "NCHW") + { + bn_layout = miopenTensorNCHW; + } + else if(layout == "NHWC") + { + bn_layout = miopenTensorNHWC; + } + else if(layout == "NCDHW") + { + bn_layout = miopenTensorNCDHW; + } + else if(layout == "NDHWC") + { + bn_layout = miopenTensorNDHWC; + } + else + { + std::cout << "Cannot handle layout : " << layout << "\n"; + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) + } + // batch norm mode type if(inflags.GetValueInt("mode") == 0) { @@ -395,242 +494,100 @@ int BatchNormDriver::SetBNParametersFromCmdLineArgs() forw = 1; } - return miopenStatusSuccess; -} - -template -int BatchNormDriver::createSaveBuffers() -{ - - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(saveMeanVar) + if(forw == 1) { - // GPU allocation - saveMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - saveInvVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(back == 1) - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - saveMean[i] = prng::gen_canonical(); - saveMean_host[i] = static_cast(saveMean[i]); - saveInvVariance[i] = prng::gen_canonical(); - saveInvVariance_host[i] = static_cast(saveInvVariance[i]); - } - } - else - { - // GPU host allocation - saveMean = std::vector(sb_sz, static_cast(0)); - saveInvVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - saveMean_host = std::vector(sb_sz, static_cast(0)); - saveInvVariance_host = std::vector(sb_sz, static_cast(0)); - } - // GPU data transfer - status |= saveMean_dev->ToGPU(q, saveMean.data()); - status |= saveInvVariance_dev->ToGPU(q, saveInvVariance.data()); + isFwdTrain = true; + } + else if(forw == 2) + { + isFwdInfer = true; } else { - saveMean_dev = nullptr; - saveInvVariance_dev = nullptr; + isBwd = true; } - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); - return miopenStatusSuccess; } template -int BatchNormDriver::createRunningBuffers() +int BatchNormDriver::AllocateBuffersAndCopy() { status_t status = STATUS_SUCCESS; DEFINE_CONTEXT(ctx); #if MIOPEN_BACKEND_OPENCL clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); #endif - size_t sb_sz = GetTensorSize(biasScaleTensor); + status |= in.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&in.GetTensor().desc)); - if(keepRunningMeanVar) + if(isFwdInfer || isFwdTrain) { - // GPU allocation - runningMean_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - runningVariance_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - - if(forw == 2) - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, 
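With the layout mapping above in place, a 2-D NHWC case can be requested directly, e.g. `MIOpenDriver bnorm -n 32 -c 64 -H 28 -W 28 --layout NHWC --forw 1`; flag spellings other than the new `--layout` come from the pre-existing driver and are assumed unchanged here. Omitting `--layout` falls back to NCHW, or to NCDHW when `-D` specifies a depth.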
static_cast(0)); - - // Populate - for(int i = 0; i < sb_sz; i++) - { - runningMean[i] = prng::gen_canonical(); - runningMean_host[i] = static_cast(runningMean[i]); - runningVariance[i] = prng::gen_canonical(); - runningVariance_host[i] = static_cast(runningVariance[i]); - } - } - else - { - // GPU host allocation - runningMean = std::vector(sb_sz, static_cast(0)); - runningVariance = std::vector(sb_sz, static_cast(0)); - - // CPU allocation - runningMean_host = std::vector(sb_sz, static_cast(0)); - runningVariance_host = std::vector(sb_sz, static_cast(0)); - } - - // GPU data transfer - status |= runningMean_dev->ToGPU(q, runningMean.data()); - status |= runningVariance_dev->ToGPU(q, runningVariance.data()); + status |= out.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out.GetTensor().desc)); + out_ref = + tensor{out.GetTensor().desc.GetLayout_t(), out.GetTensor().desc.GetLengths()}; + status |= scale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&scale.GetTensor().desc)); + status |= bias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bias.GetTensor().desc)); } - else + if(isFwdInfer) { - runningMean_dev = nullptr; - runningVariance_dev = nullptr; + status |= estMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estMean.GetTensor().desc)); + status |= + estVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&estVariance.GetTensor().desc)); } - if(status != STATUS_SUCCESS) - printf("Error copying data to GPU\n"); - - return miopenStatusSuccess; -} - -template -int BatchNormDriver::AllocateBuffersAndCopy() -{ - status_t status = STATUS_SUCCESS; - DEFINE_CONTEXT(ctx); -#if MIOPEN_BACKEND_OPENCL - clGetCommandQueueInfo(q, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, nullptr); -#endif - - size_t in_sz = GetTensorSize(inputTensor); - size_t sb_sz = GetTensorSize(biasScaleTensor); - - if(forw) + if(isFwdTrain) { + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); + status |= savedVariance.AllocOnDeviceAndInit( + q, ctx, GetTensorSize(&savedVariance.GetTensor().desc)); + status |= runMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runMean.GetTensor().desc)); + status |= + runVariance.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&runVariance.GetTensor().desc)); - size_t out_sz = GetTensorSize(outputTensor); - - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - bias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - out_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - out = std::vector(out_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); - bias = std::vector(sb_sz, static_cast(0)); + savedMean_ref = tensor{savedMean.GetTensor().desc.GetLayout_t(), + savedMean.GetTensor().desc.GetLengths()}; - // CPU allocation - out_host = std::vector(out_sz, static_cast(0)); - scale_host = std::vector(sb_sz, static_cast(0)); - bias_host = std::vector(sb_sz, static_cast(0)); + savedVariance_ref = tensor{savedVariance.GetTensor().desc.GetLayout_t(), + savedVariance.GetTensor().desc.GetLengths()}; - // Data initialization - for(int i = 0; i < in_sz; i++) - { - in[i] = prng::gen_canonical(); - } - status |= in_dev->ToGPU(q, in.data()); + runMean_ref = tensor{runMean.GetTensor().desc.GetLayout_t(), + runMean.GetTensor().desc.GetLengths()}; - // Using random beta and gamma - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - scale_host[i] = 
static_cast(scale[i]); - bias[i] = prng::gen_canonical(); - bias_host[i] = static_cast(bias[i]); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= bias_dev->ToGPU(q, bias.data()); - status |= out_dev->ToGPU(q, out.data()); - - if(forw == 1) - { // training - status |= createRunningBuffers(); - status |= createSaveBuffers(); - } - else if(forw == 2) - { // inference - status |= createRunningBuffers(); - } - } // end forward - - if(back == 1) + runVariance_ref = tensor{runVariance.GetTensor().desc.GetLayout_t(), + runVariance.GetTensor().desc.GetLengths()}; + } + if(isBwd) { + status |= out_bwd.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&out_bwd.GetTensor().desc)); - size_t out_sz = GetTensorSize(dxOutputTensor); + out_ref = tensor{out_bwd.GetTensor().desc.GetLayout_t(), + out_bwd.GetTensor().desc.GetLengths()}; - // GPU allocation - in_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dyin_dev = std::unique_ptr(new GPUMem(ctx, in_sz, sizeof(Tgpu))); - dxout_dev = std::unique_ptr(new GPUMem(ctx, out_sz, sizeof(Tgpu))); - dscale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - dbias_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); - scale_dev = std::unique_ptr(new GPUMem(ctx, sb_sz, sizeof(Tmix))); + status |= bnScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&bnScale.GetTensor().desc)); + status |= dy.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dy.GetTensor().desc)); - // GPU host allocation - in = std::vector(in_sz, static_cast(0)); - dyin = std::vector(in_sz, static_cast(0)); - dxout = std::vector(out_sz, static_cast(0)); - dscale = std::vector(sb_sz, static_cast(0)); - dbias = std::vector(sb_sz, static_cast(0)); - scale = std::vector(sb_sz, static_cast(0)); + status |= dScale.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dScale.GetTensor().desc)); + status |= dBias.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&dBias.GetTensor().desc)); + status |= + savedMean.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedMean.GetTensor().desc)); + status |= + savedInvVar.AllocOnDeviceAndInit(q, ctx, GetTensorSize(&savedInvVar.GetTensor().desc)); - // CPU allocation - dxout_host = std::vector(out_sz, static_cast(0)); - dscale_host = std::vector(sb_sz, static_cast(0)); - dbias_host = std::vector(sb_sz, static_cast(0)); + dScale_ref = tensor{dScale.GetTensor().desc.GetLayout_t(), + dScale.GetTensor().desc.GetLengths()}; - // Populate - for(int i = 0; i < sb_sz; i++) - { - scale[i] = prng::gen_canonical(); - } - status |= scale_dev->ToGPU(q, scale.data()); - status |= dscale_dev->ToGPU(q, dscale.data()); - status |= dbias_dev->ToGPU(q, dbias.data()); + dBias_ref = + tensor{dBias.GetTensor().desc.GetLayout_t(), dBias.GetTensor().desc.GetLengths()}; + } - for(int i = 0; i < in_sz; i++) - { - dyin[i] = prng::gen_canonical(); - in[i] = prng::gen_canonical(); - } - status |= dyin_dev->ToGPU(q, dyin.data()); - status |= in_dev->ToGPU(q, in.data()); - status |= dxout_dev->ToGPU(q, dxout.data()); + for(size_t i = 0; i < runMean.GetVector().size(); ++i) + { + runMean_ref.data[i] = static_cast(runMean.GetVector()[i]); + } - status |= createSaveBuffers(); + for(size_t i = 0; i < runVariance.GetVector().size(); ++i) + { + runVariance_ref.data[i] = static_cast(runVariance.GetVector()[i]); } if(status != STATUS_SUCCESS) @@ -645,37 +602,43 @@ void BatchNormDriver::runGPUFwdInference(Tref epsilon, float a if(keepRunningMeanVar) { // use precalculated mean and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - 
in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + estMean.GetDevicePtr(), + estVariance.GetDevicePtr(), + epsilon); } else { // recalculate mean and variance - miopenBatchNormalizationForwardInference(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - nullptr, - nullptr, - epsilon); + miopenBatchNormalizationForwardInference_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &estMean.GetTensor().desc, + &estVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + nullptr, + nullptr, + epsilon); } return; @@ -689,103 +652,118 @@ void BatchNormDriver::runGPUFwdTrain(Tref epsilon, { if(saveMeanVar && keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(saveMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + savedMean.GetDevicePtr(), + savedVariance.GetDevicePtr()); } else if(keepRunningMeanVar) { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - runningMean_dev->GetMem(), - runningVariance_dev->GetMem(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, 
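A note on the `_V2` entry points used throughout these calls: where the original `miopenBatchNormalizationForwardTraining`/`...ForwardInference` took the single `biasScaleTensor` descriptor seen in the removed code, the `_V2` variants take four separate descriptors (scale, bias, mean, variance), which is what lets each statistic tensor carry its own layout-aware descriptor.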
+ out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + runMean.GetDevicePtr(), + runVariance.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } else { - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); } #ifdef BN_RUNFOR_PROFILER - miopenBatchNormalizationForwardTraining(GetHandle(), - bn_mode, - &alpha, - &beta, - inputTensor, - in_dev->GetMem(), - outputTensor, - out_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - bias_dev->GetMem(), - eAF, - nullptr, - nullptr, - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationForwardTraining_V2(GetHandle(), + bn_mode, + &alpha, + &beta, + &in.GetTensor().desc, + in.GetDevicePtr(), + &out.GetTensor().desc, + out.GetDevicePtr(), + &scale.GetTensor().desc, + &bias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedVariance.GetTensor().desc, + scale.GetDevicePtr(), + bias.GetDevicePtr(), + eAF, + nullptr, + nullptr, + epsilon, + nullptr, + nullptr); #endif } @@ -866,9 +844,9 @@ int BatchNormDriver::RunForwardGPU() avgtime / (iters - 1), iters - 1); int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); float rdCnt = -1.0; float wrCnt = 1.0; if(forw == 1) @@ -890,41 +868,44 @@ int BatchNormDriver::RunForwardGPU() } template -void BatchNormDriver::runCPUFwdInference( - Tref epsilon, int batch_sz, int channels, int height, int width, int depth) +void BatchNormDriver::runCPUFwdInference(Tref epsilon) { + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + estMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estMean.GetTensor().desc); + estVariance.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(estVariance.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdInferPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? 
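The host references are 4-D, so 5-D descriptors are collapsed first via `miopen::BuildReshaped4DTensorDescriptor`, as above. The exact folding is internal to that helper and not shown in this hunk; the sketch below assumes depth folds into height, matching the removed `MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D` path that computed with `height * depth`.

```cpp
// Hypothetical folding of NCDHW lengths to 4-D; valid for packed tensors
// because only the grouping changes, not the element order.
#include <array>
#include <cstddef>

std::array<std::size_t, 4> FoldDepth(const std::array<std::size_t, 5>& ncdhw)
{
    // {N, C, D, H, W} -> {N, C, D*H, W}
    return {ncdhw[0], ncdhw[1], ncdhw[2] * ncdhw[3], ncdhw[4]};
}
```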
depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + // handle 3d case + batchNormPerActivHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdInferSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - keepRunningMeanVar, - runningMean_host.data(), - runningVariance_host.data()); + + batchNormSpatialHostInference(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + epsilon, + estMean.GetTensor(), + estVariance.GetTensor()); } else { @@ -936,59 +917,65 @@ void BatchNormDriver::runCPUFwdInference( } template -void BatchNormDriver::runCPUFwdTrain( - Tref epsilon, Tref eAF, int batch_sz, int channels, int height, int width, int depth) +void BatchNormDriver::runCPUFwdTrain(Tref epsilon, Tref eAF) { - + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + scale.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(scale.GetTensor().desc); + bias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(bias.GetTensor().desc); + savedMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedMean_ref.desc); + savedVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(savedVariance_ref.desc); + runMean_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runMean_ref.desc); + runVariance_ref.desc = miopen::BuildReshaped4DTensorDescriptor(runVariance_ref.desc); + } if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - miopenBNFwdTrainPerActivationRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + + batchNormPerActHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 - miopenBNFwdTrainSpatialRunHost(/* alpha, beta, */ batch_sz, - channels, -#if MIO_DRIVER_BN_REFERENCE_COMPUTE_3D_AS_2D - 1, - height * (isDepthSpecified ? depth : 1), -#else - (isDepthSpecified ? 
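`batchNormSpatialHostInference` (from test/fusionHost.hpp) replaces the old `miopenBNFwdInferSpatialRunHost`; the underlying per-channel arithmetic is the standard inference formula, sketched standalone below for packed NCHW data. The real helper is tensor-descriptor based; this only shows the math.

```cpp
// y = scale * (x - mean) / sqrt(var + eps) + bias, one statistic per channel.
#include <cmath>
#include <cstddef>
#include <vector>

void SpatialInferHost(const std::vector<float>& x, std::vector<float>& y,
                      const std::vector<float>& scale, const std::vector<float>& bias,
                      const std::vector<float>& mean, const std::vector<float>& var,
                      std::size_t n, std::size_t c, std::size_t hw, double eps)
{
    for(std::size_t ni = 0; ni < n; ++ni)
        for(std::size_t ci = 0; ci < c; ++ci)
        {
            const double inv = 1.0 / std::sqrt(var[ci] + eps);
            for(std::size_t i = 0; i < hw; ++i)
            {
                const std::size_t idx = (ni * c + ci) * hw + i;
                y[idx] = static_cast<float>(scale[ci] * (x[idx] - mean[ci]) * inv + bias[ci]);
            }
        }
}
```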
depth : 1), - height, -#endif - width, - in.data(), - out_host.data(), - scale_host.data(), - bias_host.data(), - epsilon, - saveMeanVar, - keepRunningMeanVar, - saveMean_host.data(), - saveInvVariance_host.data(), - runningMean_host.data(), - runningVariance_host.data(), - eAF); + + if(forw == 2 && !keepRunningMeanVar) + { + tensor empty_tensor; + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + empty_tensor, // savedMean_ref + empty_tensor, // savedVariance_ref + empty_tensor, // runMean_ref + empty_tensor); // runVariance_ref + } + else + { + batchNormSpatialHostFwdTrain(in.GetTensor(), + out_ref, + scale.GetTensor(), + bias.GetTensor(), + static_cast(epsilon), + static_cast(eAF), + savedMean_ref, + savedVariance_ref, + runMean_ref, + runVariance_ref); + } } else { @@ -1001,35 +988,27 @@ void BatchNormDriver::runCPUFwdTrain( template int BatchNormDriver::RunForwardCPU() { - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alpha = 0., beta = 0.; Tref epsilon = static_cast(EPSILON); Tref eAF = static_cast(1.0); - if(forw == 1) + if(forw == 1 || (forw == 2 && !keepRunningMeanVar)) { // training only for(int i = 0; i < inflags.GetValueInt("iter"); i++) { eAF = static_cast(1.0) / (static_cast(i) + static_cast(1.0)); - runCPUFwdTrain( - epsilon, eAF, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + runCPUFwdTrain(epsilon, eAF /* alpha, beta,*/); } } - else if(forw == 2) - { // inference only - runCPUFwdInference(epsilon, /* alpha, beta,*/ batch_sz, channels, height, width, depth); + else if(forw == 2 && keepRunningMeanVar) + { + // inference only + runCPUFwdInference(epsilon); + } + else + { + printf("Unsupported forward cpu run state.\nExiting...\n\n"); + exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) } return miopenStatusSuccess; @@ -1038,7 +1017,6 @@ int BatchNormDriver::RunForwardCPU() template int BatchNormDriver::RunBackwardGPU() { - if(!back) return miopenStatusSuccess; @@ -1058,47 +1036,53 @@ int BatchNormDriver::RunBackwardGPU() if(saveMeanVar) { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - saveMean_dev->GetMem(), - saveInvVariance_dev->GetMem()); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + savedMean.GetDevicePtr(), + savedInvVar.GetDevicePtr()); } else { - miopenBatchNormalizationBackward(GetHandle(), - bn_mode, - &alphaDataDiff, - &betaDataDiff, - &alphaParamDiff, - &betaParamDiff, - inputTensor, - in_dev->GetMem(), - dyInputTensor, - 
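In the forward-training path above, `eAF` is the exponential average factor (set to `1/(i+1)` per iteration in `RunForwardCPU`), and the saved tensors cache the batch statistics for the backward pass. Below is a sketch of the conventional per-channel update; whether the running variance stores the biased or unbiased batch variance is a library detail not settled by this hunk.

```cpp
// run = (1 - eAF) * run + eAF * batch; savedInvVar caches 1/sqrt(var + eps).
#include <cmath>

struct ChannelStats { double mean; double var; };

void UpdateStats(const ChannelStats& batch, double eAF, double eps,
                 double& runMean, double& runVar,
                 double& savedMean, double& savedInvVar)
{
    runMean     = (1.0 - eAF) * runMean + eAF * batch.mean;
    runVar      = (1.0 - eAF) * runVar + eAF * batch.var;
    savedMean   = batch.mean;
    savedInvVar = 1.0 / std::sqrt(batch.var + eps);
}
```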
dyin_dev->GetMem(), - dxOutputTensor, - dxout_dev->GetMem(), - biasScaleTensor, - scale_dev->GetMem(), - dscale_dev->GetMem(), - dbias_dev->GetMem(), - epsilon, - nullptr, - nullptr); + miopenBatchNormalizationBackward_V2(GetHandle(), + bn_mode, + &alphaDataDiff, + &betaDataDiff, + &alphaParamDiff, + &betaParamDiff, + &in.GetTensor().desc, + in.GetDevicePtr(), + &dy.GetTensor().desc, + dy.GetDevicePtr(), + &out_bwd.GetTensor().desc, + out_bwd.GetDevicePtr(), + &bnScale.GetTensor().desc, + &dBias.GetTensor().desc, + &savedMean.GetTensor().desc, + &savedInvVar.GetTensor().desc, + bnScale.GetDevicePtr(), + dScale.GetDevicePtr(), + dBias.GetDevicePtr(), + epsilon, + nullptr, + nullptr); } miopen::deref(GetHandle()).Finish(); @@ -1120,13 +1104,11 @@ int BatchNormDriver::RunBackwardGPU() avgtime += time; int in_n, in_c, in_h, in_w; - std::tie(in_n, in_c, in_h, in_w) = - miopen::tien<4>(miopen::deref(inputTensor).GetLengths()); - size_t M = in_n * in_c * in_h * in_w; - size_t dataSz = - (M + 2 * in_c) * miopen::GetTypeSize(miopen::deref(inputTensor).GetType()); - float rdCnt = 2.0; - float wrCnt = 1.0; + std::tie(in_n, in_c, in_h, in_w) = miopen::tien<4>(in.GetTensor().desc.GetLengths()); + size_t M = in_n * in_c * in_h * in_w; + size_t dataSz = (M + 2 * in_c) * miopen::GetTypeSize(in.GetTensor().desc.GetType()); + float rdCnt = 2.0; + float wrCnt = 1.0; // layer, flopCnt, reads, writes, GFLOPS, GB/s, timeMs printf("stats: bnormb, 0, %zu, %zu, 0, %f, %f\n", dataSz, @@ -1176,27 +1158,28 @@ int BatchNormDriver::VerifyForward() if(keepRunningMeanVar) { // copy back for verification - runningMean_dev->FromGPU(GetStream(), runningMean.data()); - runningVariance_dev->FromGPU(GetStream(), runningVariance.data()); + runMean.CopyFromDeviceToHost(GetStream()); + runVariance.CopyFromDeviceToHost(GetStream()); + + auto errorRunMean = miopen::rms_range(runMean_ref.data, runMean.GetVector()); - auto errorRunMean = miopen::rms_range(runningMean_host, runningMean); if(!std::isfinite(errorRunMean) || errorRunMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on running mean: " << errorRunMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < runningMean.size() && i < runningMean_host.size() && + for(int i = 0; i < runMean.GetVector().size() && i < runMean_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i]))); + diff = fabs(Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rm[" << i << "]: " << runningMean[i]; - std::cout << ", rm_host[" << i << "]: " << runningMean_host[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(runningMean[i]) - fabs(runningMean_host[i])) + std::cout << "rm[" << i << "]: " << runMean.GetVector()[i]; + std::cout << ", rm_host[" << i << "]: " << runMean_ref.data[i]; + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(runMean.GetVector()[i]) - fabs(runMean_ref.data[i])) << std::endl; } } @@ -1208,24 +1191,26 @@ int BatchNormDriver::VerifyForward() << errorRunMean << ')' << std::endl; } - auto errorRunVar = miopen::rms_range(runningVariance_host, runningVariance); + auto errorRunVar = miopen::rms_range(runVariance_ref.data, runVariance.GetVector()); if(!std::isfinite(errorRunVar) || errorRunVar > maxrms) { std::cout << "Forward train batch norm verification FAILED on running variance: " << errorRunVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < 
runningVariance.size() && i < runningVariance_host.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < runVariance.GetVector().size() && + i < runVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i]))); + diff = fabs( + Tmix(fabs(runVariance.GetVector()[i]) - fabs(runVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "rv[" << i << "]: " << runningVariance[i]; - std::cout << ", rv_host[" << i << "]: " << runningVariance_host[i]; + std::cout << "rv[" << i << "]: " << runVariance.GetVector()[i]; + std::cout << ", rv_host[" << i << "]: " << runVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(runningVariance[i]) - fabs(runningVariance_host[i])) + << Tmix(fabs(runVariance.GetVector()[i]) - + fabs(runVariance_ref.data[i])) << std::endl; } } @@ -1240,28 +1225,29 @@ int BatchNormDriver::VerifyForward() if(saveMeanVar) { // copy back for verification - saveMean_dev->FromGPU(GetStream(), saveMean.data()); - saveInvVariance_dev->FromGPU(GetStream(), saveInvVariance.data()); + savedMean.CopyFromDeviceToHost(GetStream()); + savedVariance.CopyFromDeviceToHost(GetStream()); maxval = static_cast(0.0); - auto errorSaveMean = miopen::rms_range(saveMean_host, saveMean); + auto errorSaveMean = miopen::rms_range(savedMean_ref.data, savedMean.GetVector()); if(!std::isfinite(errorSaveMean) || errorSaveMean > maxrms) { std::cout << "Forward train batch norm verification FAILED on saved mean: " << errorSaveMean << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; - i < saveMean.size() && i < saveMean_host.size() && i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedMean.GetVector().size() && i < savedMean_ref.data.size() && + i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i]))); + diff = fabs(Tmix(fabs(savedMean.GetVector()[i]) - fabs(savedMean_ref.data[i]))); maxval = maxval < diff ? 
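All of these checks compare against `maxrms` via `miopen::rms_range` from test/verify.hpp. Its exact normalization is not shown in this patch; the sketch below is one common relative-RMS formulation, given only to illustrate the kind of metric being applied.

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Relative RMS error: sqrt(sum((ref - out)^2) / sum(ref^2)).
double RmsError(const std::vector<double>& ref, const std::vector<double>& out)
{
    double num = 0.0, den = 0.0;
    const std::size_t n = std::min(ref.size(), out.size());
    for(std::size_t i = 0; i < n; ++i)
    {
        const double d = ref[i] - out[i];
        num += d * d;
        den += ref[i] * ref[i];
    }
    return den > 0.0 ? std::sqrt(num / den) : std::sqrt(num);
}
```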
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sm[" << i << "]: " << saveMean[i]; - std::cout << ", sm_host[" << i << "]: " << saveMean_host[i]; - std::cout << ", diff[" << i - << "]: " << Tmix(fabs(saveMean[i]) - fabs(saveMean_host[i])) + std::cout << "sm[" << i << "]: " << savedMean.GetVector()[i]; + std::cout << ", sm_host[" << i << "]: " << savedMean_ref.data[i]; + std::cout << ", diff[" << i << "]: " + << Tmix(fabs(savedMean.GetVector()[i]) - + fabs(savedMean_ref.data[i])) << std::endl; } } @@ -1274,7 +1260,8 @@ int BatchNormDriver::VerifyForward() << errorSaveMean << ')' << std::endl; } - auto errorSaveVar = miopen::rms_range(saveInvVariance_host, saveInvVariance); + auto errorSaveVar = + miopen::rms_range(savedVariance_ref.data, savedVariance.GetVector()); if(!std::isfinite(errorSaveVar) || errorSaveVar > maxrms) { std::cout @@ -1282,17 +1269,19 @@ int BatchNormDriver::VerifyForward() << errorSaveVar << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < saveInvVariance.size() && i < saveInvVariance_host.size() && - i < MIO_BN_MAX_DEBUGLOOP; + for(int i = 0; i < savedVariance.GetVector().size() && + i < savedVariance_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i]))); + diff = fabs( + Tmix(fabs(savedVariance.GetVector()[i]) - fabs(savedVariance_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "sv[" << i << "]: " << saveInvVariance[i]; - std::cout << ", sv_host[" << i << "]: " << saveInvVariance_host[i]; + std::cout << "sv[" << i << "]: " << savedVariance.GetVector()[i]; + std::cout << ", sv_host[" << i << "]: " << savedVariance_ref.data[i]; std::cout << ", diff[" << i << "]: " - << Tmix(fabs(saveInvVariance[i]) - fabs(saveInvVariance_host[i])) + << Tmix(fabs(savedVariance.GetVector()[i]) - + fabs(savedVariance_ref.data[i])) << std::endl; } } @@ -1307,38 +1296,40 @@ int BatchNormDriver::VerifyForward() } // end if(saveMeanVar) } - // Check output tensor error - out_dev->FromGPU(GetStream(), out.data()); + out.CopyFromDeviceToHost(GetStream()); + maxval = static_cast(0.0); - auto errorOut = miopen::rms_range(out_host, out); + auto errorOut = miopen::rms_range(out_ref.data, out.GetVector()); if(!std::isfinite(errorOut) || errorOut > maxrms) { std::cout << "Forward batch norm verification FAILED on output: " << errorOut << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) unsigned int count = 0; - for(int i = 0; i < out.size() && i < out_host.size(); i++) + for(int i = 0; i < out.GetVector().size() && i < out_ref.data.size(); i++) { - if(std::isnan(out[i])) + if(std::isnan(out.GetVector()[i])) { - std::cout << "out[" << i << "] produced a nan: " << out[i] << std::endl; + std::cout << "out[" << i << "] produced a nan: " << out.GetVector()[i] << std::endl; } - if(std::isnan(out_host[i])) + if(std::isnan(out_ref.data[i])) { - std::cout << "out_host[" << i << "] produced a nan: " << out_host[i] << std::endl; + std::cout << "out_ref[" << i << "] produced a nan: " << out_ref.data[i] + << std::endl; } - diff = Tref(fabs(out[i]) - fabs(out_host[i])); + diff = Tref(fabs(out.GetVector()[i]) - fabs(out_ref.data[i])); maxval = maxval < diff ? 
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "out[" << i << "]: " << out[i]; - std::cout << ", out_host[" << i << "]: " << out_host[i]; - std::cout << ", diff[" << i << "]: " << Tref(out[i] - out_host[i]) << std::endl; + std::cout << "out[" << i << "]: " << out.GetVector()[i]; + std::cout << ", out_ref.data[" << i << "]: " << out_ref.data[i]; + std::cout << ", diff[" << i << "]: " << Tref(out.GetVector()[i] - out_ref.data[i]) + << std::endl; count++; } } - std::cout << "Number of elements: " << out.size() << std::endl; + std::cout << "Number of elements: " << out.GetVector().size() << std::endl; std::cout << "Number of bad elements: " << count << std::endl; std::cout << "max difference in output: " << maxval << std::endl; #endif @@ -1365,61 +1356,77 @@ int BatchNormDriver::RunBackwardCPU() if(!back) return miopenStatusSuccess; - int nIn = 0, cIn = 0, dIn = 0, hIn = 0, wIn = 0; - if(isDepthSpecified) - miopenGet5dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &dIn, &hIn, &wIn); - else - miopenGet4dTensorDescriptorLengths(inputTensor, &nIn, &cIn, &hIn, &wIn); - - int batch_sz = nIn; - int channels = cIn; - int height = hIn; - int width = wIn; - int depth = dIn; - // T alphaDiff = 1, betaDiff = 0; // T alphaParam = 1, betaParam = 0; - Tref epsilon = static_cast(EPSILON); + double alpha = static_cast(1), beta = static_cast(0), + gamma = static_cast(1); + + // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); + // float alphaParamDiff = static_cast(1), betaParamDiff = static_cast(0); + int size{0}; + miopenGetTensorDescriptorSize(&in.GetTensor().desc, &size); + if(size == 5) + { + in.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(in.GetTensor().desc); + dy.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dy.GetTensor().desc); + out_bwd.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(out_bwd.GetTensor().desc); + out_ref.desc = miopen::BuildReshaped4DTensorDescriptor(out_ref.desc); + bnScale.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(bnScale.GetTensor().desc); + dBias.GetTensor().desc = miopen::BuildReshaped4DTensorDescriptor(dBias.GetTensor().desc); + dScale_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dScale_ref.desc); + dBias_ref.desc = miopen::BuildReshaped4DTensorDescriptor(dBias_ref.desc); + savedMean.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedMean.GetTensor().desc); + savedInvVar.GetTensor().desc = + miopen::BuildReshaped4DTensorDescriptor(savedInvVar.GetTensor().desc); + } if(bn_mode == miopenBNPerActivation) - { // 1xCxHxW - miopenBNBwdPerActivationRunHost(/* alphaDiff, betaDiff, alphaParam, - betaParam, */ - batch_sz, - channels, - (isDepthSpecified ? depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + { + // 1xCxHxW + batchNormActivSpatialHostBwdTrain(activ_mode, + gamma, + beta, + alpha, + in.GetTensor(), + dy.GetTensor(), + out.GetTensor(), + out_ref, + bnScale.GetTensor(), + dBias.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else if(bn_mode == miopenBNSpatial) - { // 1xCx1x1 - miopenBNBwdSpatialRunHost(/* alphaDiff, betaDiff, alphaParam, betaParam, - */ - batch_sz, - channels, - (isDepthSpecified ? 
depth : 1), - height, - width, - in.data(), - dyin.data(), - dxout_host.data(), - scale.data(), - dscale_host.data(), - dbias_host.data(), - epsilon, - saveMeanVar, - saveMean_host.data(), - saveInvVariance_host.data()); + { // 1xCx1x1 + if(saveMeanVar) + { + + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); + } + else + { + tensor empty_tensor; + batchNormSpatialHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + empty_tensor, + empty_tensor); + } } else { @@ -1443,33 +1450,36 @@ int BatchNormDriver::VerifyBackward() RunBackwardCPU(); - dxout_dev->FromGPU(GetStream(), dxout.data()); - dscale_dev->FromGPU(GetStream(), dscale.data()); - dbias_dev->FromGPU(GetStream(), dbias.data()); + out_bwd.CopyFromDeviceToHost(GetStream()); + dScale.CopyFromDeviceToHost(GetStream()); + dBias.CopyFromDeviceToHost(GetStream()); + #if(MIO_BN_DEBUG == 1) const Tref tolerance = static_cast(1000 * (sizeof(Tgpu) == 4) ? ERRTOL_FP32 : ERRTOL_FP16); Tref diff = static_cast(0.0); #endif maxval = static_cast(0.0); - auto errordxout = miopen::rms_range(dxout_host, dxout); + auto errordxout = miopen::rms_range(out_ref.data, out_bwd.GetVector()); if(!std::isfinite(errordxout) || errordxout > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dx: " << errordxout << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dxout.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < out_ref.data.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tgpu(fabs(dxout[i]) - fabs(dxout_host[i]))); + diff = fabs(Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i]))); maxval = maxval < diff ? diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dxout[" << i << "]: " << dxout[i]; - std::cout << "\tdxout_host[" << i << "]: " << dxout_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tgpu(fabs(dxout[i]) - fabs(dxout_host[i])); + std::cout << "out_ref[" << i << "]: " << out_ref.data[i]; + std::cout << "\tout_bwd.GetVector()[" << i << "]: " << out_bwd.GetVector()[i]; + std::cout << "\tdiff[" << i + << "]: " << Tgpu(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])); std::cout << "\tratioH: " - << fabs(fabs(dxout[i]) - fabs(dxout_host[i])) / fabs(dxout_host[i]) + << fabs(fabs(out_ref.data[i]) - fabs(out_bwd.GetVector()[i])) / + fabs(out_bwd.GetVector()[i]) << std::endl; } } @@ -1483,25 +1493,26 @@ int BatchNormDriver::VerifyBackward() } maxval = static_cast(0.0); - auto errordscale = miopen::rms_range(dscale_host, dscale); + auto errordscale = miopen::rms_range(dScale_ref.data, dScale.GetVector()); if(!std::isfinite(errordscale) || errordscale > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dscale: " << errordscale << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dscale.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dScale.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dscale[i]) - fabs(dscale_host[i]))); - maxval = maxval < diff ? diff : maxval; + auto diff = fabs(Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i]))); + maxval = maxval < diff ? 
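`batchNormSpatialHostBwdTrain` stands in for the removed `miopenBNBwdSpatialRunHost`; the math it has to reproduce is the standard spatial BN backward pass, sketched below for one channel of packed data using the cached `savedMean`/`savedInvVar` from the forward pass.

```cpp
// Per channel, with M = N*H*W elements reduced:
//   xhat   = (x - mean) * invVar
//   dbias  = sum(dy)
//   dscale = sum(dy * xhat)
//   dx     = scale * invVar / M * (M * dy - dbias - xhat * dscale)
#include <cstddef>
#include <vector>

void SpatialBwdChannel(const std::vector<double>& x, const std::vector<double>& dy,
                       std::vector<double>& dx, double scale, double mean,
                       double invVar, double& dscale, double& dbias)
{
    const double M = static_cast<double>(x.size());
    dscale = 0.0;
    dbias  = 0.0;
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        const double xhat = (x[i] - mean) * invVar;
        dbias += dy[i];
        dscale += dy[i] * xhat;
    }
    for(std::size_t i = 0; i < x.size(); ++i)
    {
        const double xhat = (x[i] - mean) * invVar;
        dx[i] = scale * invVar / M * (M * dy[i] - dbias - xhat * dscale);
    }
}
```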
diff : maxval; if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dscale[" << i << "]: " << dscale[i]; - std::cout << "\tdscale_host[" << i << "]: " << dscale_host[i]; + std::cout << "dscale[" << i << "]: " << dScale.GetVector()[i]; + std::cout << "\tdscale_host[" << i << "]: " << dScale_ref.data[i]; std::cout << "\tdiff[" << i - << "]: " << Tmix(fabs(dscale[i]) - fabs(dscale_host[i])); + << "]: " << Tmix(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dscale[i]) - fabs(dscale_host[i])) / fabs(dscale_host[i]) + << fabs(fabs(dScale.GetVector()[i]) - fabs(dScale_ref.data[i])) / + fabs(dScale_ref.data[i]) << std::endl; } } @@ -1514,23 +1525,25 @@ int BatchNormDriver::VerifyBackward() << ')' << std::endl; } - auto errordbias = miopen::rms_range(dbias_host, dbias); + auto errordbias = miopen::rms_range(dBias_ref.data, dBias.GetVector()); if(!std::isfinite(errordbias) || errordbias > maxrms) { std::cout << "Backwards prop batch norm verification FAILED on dbias: " << errordbias << std::endl; anError = true; #if(MIO_BN_DEBUG == 1) - for(int i = 0; i < dbias.size() && i < MIO_BN_MAX_DEBUGLOOP; i++) + for(int i = 0; i < dBias.GetVector().size() && i < MIO_BN_MAX_DEBUGLOOP; i++) { - diff = fabs(Tmix(fabs(dbias[i]) - fabs(dbias_host[i]))); + diff = fabs(Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i]))); if(!std::isfinite(diff) || diff > tolerance) { - std::cout << "dbias[" << i << "]: " << dbias[i]; - std::cout << "\tdbias_host[" << i << "]: " << dbias_host[i]; - std::cout << "\tdiff[" << i << "]: " << Tmix(fabs(dbias[i]) - fabs(dbias_host[i])); + std::cout << "dbias[" << i << "]: " << dBias.GetVector()[i]; + std::cout << "\tdbias_host[" << i << "]: " << dBias_ref.data[i]; + std::cout << "\tdiff[" << i + << "]: " << Tmix(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])); std::cout << "\tratioH: " - << fabs(fabs(dbias[i]) - fabs(dbias_host[i])) / fabs(dbias_host[i]) + << fabs(fabs(dBias.GetVector()[i]) - fabs(dBias_ref.data[i])) / + fabs(dBias_ref.data[i]) << std::endl; } } diff --git a/driver/conv_driver.hpp b/driver/conv_driver.hpp index 900f52f683..9d35f9a129 100644 --- a/driver/conv_driver.hpp +++ b/driver/conv_driver.hpp @@ -180,135 +180,6 @@ static inline miopenDataType_t DataTypeFromShortString(const std::string& type) } } -template -class GpumemTensor -{ - std::unique_ptr dev; - tensor host; - bool is_gpualloc = false; - -public: - void SetGpuallocMode(bool v) { is_gpualloc = v; } - tensor& GetTensor() { return host; } - - void AllocOnHost(miopenTensorDescriptor_t t) - { - host = tensor(miopen::deref(t)); - if(is_gpualloc) // We do not need host data. - { - host.data.clear(); - host.data.shrink_to_fit(); // To free host memory. - } - } - - std::vector& GetVector() - { - if(is_gpualloc) - MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in " - "'--gpualloc 1' mode"); - return host.data; - } - - Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); } - std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); } - - void - InitHostData(const size_t sz, // - const bool do_write, // If set to false, then only generate random data. This is - // necessary to reproduce values in input buffers even if some - // directions are skipped. For example, inputs for Backward - // will be the same for both "-F 0" and "-F 2". 
- std::function generator) - { - if(is_gpualloc) - { - /// In gpualloc mode, we do not care about reproducibility of results, because - /// validation is not used. Therefore, we do not have to always generate random value - /// (\ref move_rand) - return; - } - - for(size_t i = 0; i < sz; ++i) - { - /// \anchor move_rand - /// Generate random value, even if buffer is unused. This provides the same - /// initialization of input buffers regardless of which kinds of - /// convolutions are currently selectedfor testing (see the "-F" option). - /// Verification cache would be broken otherwise. - auto val = generator(); - if(do_write) - GetVector()[i] = val; - } - } - - status_t AllocOnDevice(stream, context_t ctx, const size_t sz) - { - dev = std::make_unique(ctx, sz, sizeof(Tgpu)); - return STATUS_SUCCESS; - } - - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz) - { - AllocOnDevice(q, ctx, sz); - if(is_gpualloc) - { - /// \anchor gpualloc_random_init - /// In gpualloc mode, we do not want to leave input buffers uninitialized, because - /// there could be NaNs and Infs, which may affect the performance (which we are - /// interested to evaluate in this mode). Initialization with all 0's is not the - /// best choice as well, because GPU HW may optimize out computations with 0's and - /// that could affect performance of kernels too. That is why we are using - /// rocrand to initialize input buffers. - /// - /// However we do not care about precision in gpualloc mode, because validation - /// is not used. Therefore, range (0,1] is fine. - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, GetVectorData()); - } - - template - status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) - { - static_assert(std::is_same::value // - || std::is_same::value, // - "Before enabling more types, check thoroughly."); - dev = std::make_unique(ctx, sz, sizeof(T)); - return STATUS_SUCCESS; - } - - template - status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) - { - AllocOnDevice(q, ctx, sz, init); - if(is_gpualloc) - { - /// \ref gpualloc_random_init - return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); - } - return dev->ToGPU(q, init.data()); - } - - status_t CopyFromDeviceToHost(stream q) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); - } - - template - status_t CopyFromDeviceToHost(stream q, tensor& t) - { - return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); - } - - template - status_t CopyFromDeviceToHost(stream q, std::vector& v) - { - return is_gpualloc ? 
STATUS_SUCCESS : dev->FromGPU(q, v.data());
-    }
-
-    auto GetDevicePtr() -> auto { return dev->GetMem(); }
-};
-
 template <typename Tgpu>
 class GpumemVector
 {
diff --git a/driver/dm_bnorm.cpp b/driver/dm_bnorm.cpp
index c7bab90bb5..24e986fa1d 100644
--- a/driver/dm_bnorm.cpp
+++ b/driver/dm_bnorm.cpp
@@ -31,7 +31,11 @@ static Driver* makeDriver(const std::string& base_arg)
     if(base_arg == "bnorm")
         return new BatchNormDriver();
     if(base_arg == "bnormfp16")
+        return new BatchNormDriver();
+    if(base_arg == "bnormfp16fp32")
         return new BatchNormDriver();
+    if(base_arg == "bnormbfp16fp32")
+        return new BatchNormDriver();
     return nullptr;
 }
diff --git a/driver/driver.hpp b/driver/driver.hpp
index 4d33f7ed38..df0a85fece 100644
--- a/driver/driver.hpp
+++ b/driver/driver.hpp
@@ -38,6 +38,9 @@
 #include
 #include
 #include
+#include <../test/tensor_holder.hpp>
+#include "util_driver.hpp"
+#include "rocrand_wrapper.hpp"
 using half = half_float::half;
 using hip_bfloat16 = bfloat16;
 #include
@@ -157,6 +160,140 @@ struct GPUMem
 #endif
 };
 
+template <typename Tgpu>
+class GpumemTensor
+{
+    std::unique_ptr<GPUMem> dev;
+    tensor<Tgpu> host;
+    bool is_gpualloc = false;
+
+public:
+    void SetGpuallocMode(bool v) { is_gpualloc = v; }
+    tensor<Tgpu>& GetTensor() { return host; }
+
+    void AllocOnHost(miopenTensorDescriptor_t t)
+    {
+        host = tensor<Tgpu>(miopen::deref(t));
+        if(is_gpualloc) // We do not need host data.
+        {
+            host.data.clear();
+            host.data.shrink_to_fit(); // To free host memory.
+        }
+    }
+    template <typename T>
+    void AllocOnHost(tensor<T> t)
+    {
+        AllocOnHost(&t.desc);
+    }
+
+    std::vector<Tgpu>& GetVector()
+    {
+        if(is_gpualloc)
+            MIOPEN_THROW("[MIOpenDriver] GpumemTensor::GetVector should not be called in "
+                         "'--gpualloc 1' mode");
+        return host.data;
+    }
+
+    Tgpu* GetVectorData() { return is_gpualloc ? nullptr : host.data.data(); }
+    std::size_t GetVectorSize() const { return is_gpualloc ? 0 : host.data.size(); }
+
+    void
+    InitHostData(const size_t sz,     //
+                 const bool do_write, // If set to false, then only generate random data. This is
+                                      // necessary to reproduce values in input buffers even if some
+                                      // directions are skipped. For example, inputs for Backward
+                                      // will be the same for both "-F 0" and "-F 2".
+                 std::function<Tgpu()> generator)
+    {
+        if(is_gpualloc)
+        {
+            /// In gpualloc mode, we do not care about reproducibility of results, because
+            /// validation is not used. Therefore, we do not have to always generate random value
+            /// (\ref move_rand)
+            return;
+        }
+
+        for(size_t i = 0; i < sz; ++i)
+        {
+            /// \anchor move_rand
+            /// Generate random value, even if buffer is unused. This provides the same
+            /// initialization of input buffers regardless of which kinds of
+            /// convolutions are currently selected for testing (see the "-F" option).
+            /// Verification cache would be broken otherwise.
+            auto val = generator();
+            if(do_write)
+                GetVector()[i] = val;
+        }
+    }
+
+    status_t AllocOnDevice(stream, context_t ctx, const size_t sz)
+    {
+        dev = std::make_unique<GPUMem>(ctx, sz, sizeof(Tgpu));
+        return STATUS_SUCCESS;
+    }
+
+    status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz)
+    {
+        AllocOnDevice(q, ctx, sz);
+        if(is_gpualloc)
+        {
+            /// \anchor gpualloc_random_init
+            /// In gpualloc mode, we do not want to leave input buffers uninitialized, because
+            /// there could be NaNs and Infs, which may affect the performance (which we are
+            /// interested to evaluate in this mode). Initialization with all 0's is not the
+            /// best choice as well, because GPU HW may optimize out computations with 0's and
+            /// that could affect performance of kernels too.
That is why we are using + /// rocrand to initialize input buffers. + /// + /// However we do not care about precision in gpualloc mode, because validation + /// is not used. Therefore, range (0,1] is fine. + return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, GetVectorData()); + } + + template + status_t AllocOnDevice(stream, context_t ctx, const size_t sz, std::vector&) + { + static_assert(std::is_same::value // + || std::is_same::value, // + "Before enabling more types, check thoroughly."); + dev = std::make_unique(ctx, sz, sizeof(T)); + return STATUS_SUCCESS; + } + + template + status_t AllocOnDeviceAndInit(stream q, context_t ctx, const size_t sz, std::vector& init) + { + AllocOnDevice(q, ctx, sz, init); + if(is_gpualloc) + { + /// \ref gpualloc_random_init + return gpumemrand::gen_0_1(static_cast(GetDevicePtr()), sz); + } + return dev->ToGPU(q, init.data()); + } + + status_t CopyFromDeviceToHost(stream q) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, GetVectorData()); + } + + template + status_t CopyFromDeviceToHost(stream q, tensor& t) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, t.data.data()); + } + + template + status_t CopyFromDeviceToHost(stream q, std::vector& v) + { + return is_gpualloc ? STATUS_SUCCESS : dev->FromGPU(q, v.data()); + } + + auto GetDevicePtr() -> auto { return dev->GetMem(); } +}; + inline void PadBufferSize(size_t& sz, int datatype_sz) { size_t page_sz = (2 * 1024 * 1024) / datatype_sz; @@ -194,16 +331,16 @@ inline std::string ParseBaseArg(int argc, char* argv[]) if(arg != "conv" && arg != "convfp16" && arg != "convint8" && arg != "convbfp16" && arg != "pool" && arg != "poolfp16" && arg != "lrn" && arg != "lrnfp16" && arg != "activ" && arg != "activfp16" && arg != "softmax" && arg != "softmaxfp16" && arg != "bnorm" && - arg != "bnormfp16" && arg != "rnn" && arg != "rnnfp16" && arg != "rnn_seq" && - arg != "rnn_seqfp16" && arg != "gemm" && arg != "gemmfp16" && arg != "ctc" && - arg != "dropout" && arg != "dropoutfp16" && arg != "tensorop" && arg != "reduce" && - arg != "reducefp16" && arg != "reducefp64" && arg != "layernorm" && arg != "layernormfp16" && - arg != "layernormbfp16" && arg != "sum" && arg != "sumfp16" && arg != "sumbfp16" && - arg != "groupnorm" && arg != "groupnormfp16" && arg != "groupnormbfp16" && arg != "cat" && - arg != "catfp16" && arg != "catbfp16" && arg != "addlayernorm" && - arg != "addlayernormfp16" && arg != "addlayernormbfp16" && arg != "t5layernorm" && - arg != "t5layernormfp16" && arg != "t5layernormbfp16" && arg != "adam" && - arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && + arg != "bnormfp16" && arg != "bnormfp16fp32" && arg != "bnormbfp16fp32" && arg != "rnn" && + arg != "rnnfp16" && arg != "rnn_seq" && arg != "rnn_seqfp16" && arg != "gemm" && + arg != "gemmfp16" && arg != "ctc" && arg != "dropout" && arg != "dropoutfp16" && + arg != "tensorop" && arg != "reduce" && arg != "reducefp16" && arg != "reducefp64" && + arg != "layernorm" && arg != "layernormfp16" && arg != "layernormbfp16" && arg != "sum" && + arg != "sumfp16" && arg != "sumbfp16" && arg != "groupnorm" && arg != "groupnormfp16" && + arg != "groupnormbfp16" && arg != "cat" && arg != "catfp16" && arg != "catbfp16" && + arg != "addlayernorm" && arg != "addlayernormfp16" && arg != "addlayernormbfp16" && + arg != "t5layernorm" && arg != "t5layernormfp16" && arg != "t5layernormbfp16" && + arg != "adam" && arg != "adamfp16" && arg != "ampadam" && arg != "reduceextreme" && arg != 
"reduceextremefp16" && arg != "reduceextremebfp16" && arg != "adamw" && arg != "adamwfp16" && arg != "ampadamw" && arg != "transformersadamw" && arg != "transformersadamwfp16" && arg != "transformersampadamw" && arg != "getitem" && diff --git a/fin b/fin index 8c40a3c3b4..344cf42f6c 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 8c40a3c3b41a7d2fb31a8e747155fde4223919b9 +Subproject commit 344cf42f6c18f309f3d1dd08af1cd7b73dd38e46 diff --git a/include/miopen/miopen.h b/include/miopen/miopen.h index c919c95e0a..67652ab832 100644 --- a/include/miopen/miopen.h +++ b/include/miopen/miopen.h @@ -2737,6 +2737,67 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, double epsilon, void* resultSaveMean, void* resultSaveInvVariance); +/*! @brief Execute forward training layer for batch normalization + * + * Batch normalization pass for forward training pass. + * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale + * with their descriptor. + * + * If either resultSaveMean, or resultSaveInvVariance are null pointers then the values for the mean + * and inverse variance will not be used. + * + * Likewise, if either resultRunningMean, or resultRunningVariance are null pointers then the values + * for the running mean and variance will not be saved. + * Running averages and variances are scaled using an exponential averaging factor: \f[ + * \mu_{old} = \mu_{new}*factor + \mu_{old}*(1-factor) + * \f] + * where \f[ + * factor=1/(1+iteration) + * \f] + * + * @param handle MIOpen handle (input) + * @param bn_mode Batch normalization mode (input) + * @param alpha Floating point scaling factor, allocated on the host (input) + * @param beta Floating point shift factor, allocated on the host (input) + * @param xDesc Tensor descriptor for data input tensor x (input) + * @param x Data tensor x (input) + * @param yDesc Tensor descriptor for output data tensor y (input) + * @param y Data tensor y (output) + * @param ScaleDesc Tensor descriptor for BN scaling + * @param biasVarDesc Tensor descriptor for BN bias + * @param savedMeanDesc Tensor descriptor for BN saved Mean + * @param savedVarDesc Tensor descriptor for BN saved Variance + * @param bnScale Batch norm scaling, gamma, tensor (input) + * @param bnBias Batch norm bias, beta, tensor (input) + * @param expAvgFactor Exponential averaging factor (input) + * @param resultRunningMean Running average saved for inference (output) + * @param resultRunningVariance Running variance saved for inference (output) + * @param epsilon Value to stablize inverse variance calculation (input) + * @param resultSaveMean Saved mini-batch mean for backwards pass (output) + * @param resultSaveInvVariance Saved mini-batch inverse variance for backwards pass (output) + * @return miopenStatus_t + */ +MIOPEN_EXPORT miopenStatus_t +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t biasVarDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance); /*! 
@brief Execute forward inference layer for batch normalization
 *
@@ -2783,6 +2844,56 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle,
                                          void* estimatedVariance,
                                          double epsilon);
+/*! @brief Execute forward inference layer for batch normalization
+ *
+ * Batch normalization pass for forward inference pass.
+ * Takes in batch normalization mode bn_mode and input tensor x, output tensor y, bnBias and bnScale
+ * with their descriptors.
+ *
+ * If either estimatedMean or estimatedVariance is a null pointer, then the mean and variance
+ * will be calculated from the input data, and these calculated values will be used to
+ * normalize the input.
+ * If the variance is zero and epsilon is also zero, this function outputs NaN values. The input
+ * epsilon value should always be a non-zero positive value.
+ *
+ * @param handle MIOpen handle (input)
+ * @param bn_mode Batch normalization mode (input)
+ * @param alpha Floating point scaling factor, allocated on the host (input)
+ * @param beta Floating point shift factor, allocated on the host (input)
+ * @param xDesc Tensor descriptor for data input tensor x (input)
+ * @param x Data tensor x (input)
+ * @param yDesc Tensor descriptor for output data tensor y (input)
+ * @param y Data tensor y (output)
+ * @param scaleDesc Tensor descriptor for BN scaling (input)
+ * @param biasDesc Tensor descriptor for BN bias (input)
+ * @param estMeanDesc Tensor descriptor for BN estimated Mean (input)
+ * @param estVarianceDesc Tensor descriptor for BN estimated Variance (input)
+ * @param bnScale Batch norm scaling, gamma, tensor (input)
+ * @param bnBias Batch norm bias, beta, tensor (input)
+ * @param estimatedMean Running average saved during forward training (input)
+ * @param estimatedVariance Running variance saved during forward training (input)
+ * @param epsilon Value to stabilize inverse variance calculation (input)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle,
+                                            miopenBatchNormMode_t bn_mode,
+                                            void* alpha,
+                                            void* beta,
+                                            const miopenTensorDescriptor_t xDesc,
+                                            const void* x,
+                                            const miopenTensorDescriptor_t yDesc,
+                                            void* y,
+                                            const miopenTensorDescriptor_t scaleDesc,
+                                            const miopenTensorDescriptor_t biasDesc,
+                                            const miopenTensorDescriptor_t estMeanDesc,
+                                            const miopenTensorDescriptor_t estVarianceDesc,
+                                            void* bnScale,
+                                            void* bnBias,
+                                            void* estimatedMean,
+                                            void* estimatedVariance,
+                                            double epsilon);
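
The matching inference-time call, reusing the hypothetical descriptors and buffers from the training sketch above; the running statistics produced during training feed estimatedMean/estimatedVariance:

void FwdInferV2Sketch()
{
    float alpha = 1.0f, beta = 0.0f;
    miopenBatchNormalizationForwardInference_V2(handle,
                                                miopenBNSpatial,
                                                &alpha,
                                                &beta,
                                                xDesc,
                                                x,
                                                yDesc,
                                                y,
                                                bnDesc, // scaleDesc
                                                bnDesc, // biasDesc
                                                bnDesc, // estMeanDesc
                                                bnDesc, // estVarianceDesc
                                                scale,
                                                bias,
                                                runMean,
                                                runVar,
                                                1e-05); // epsilon
}

+
 /*! @brief Execute backwards propagation layer for batch normalization
 *
 * Batch normalization pass for backwards propagation training pass.
 * The method for backwards propagation batch normalization.
@@ -2838,6 +2949,68 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
                                 const void* savedMean,
                                 const void* savedInvVariance);
+/*! @brief Execute backwards propagation layer for batch normalization
+ *
+ * Batch normalization pass for backwards propagation training pass.
+ * The method for backwards propagation batch normalization.
+ *
+ * Takes in batch normalization mode bn_mode and input tensor data x, input activation tensor dy,
+ * output tensor dx, the learned tensors resultBNBiasDiff and resultBNScaleDiff with their
+ * descriptor.
+ *
+ * If BOTH savedMean and savedVariance are not null pointers then the method will use the saved
+ * mean and variance calculated by the forward training phase.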
+ *
+ * @param handle MIOpen handle (input)
+ * @param bn_mode Batch normalization mode (input)
+ * @param alphaDataDiff Floating point scaling factor, allocated on the host (input)
+ * @param betaDataDiff Floating point shift factor, allocated on the host (input)
+ * @param alphaParamDiff Floating point scaling factor, allocated on the host (input)
+ * @param betaParamDiff Floating point shift factor, allocated on the host (input)
+ * @param xDesc Tensor descriptor for data input tensor x (input)
+ * @param x Data tensor x (input)
+ * @param dyDesc Tensor descriptor for output data tensor y (input)
+ * @param dy Data delta tensor dy (input)
+ * @param dxDesc Tensor descriptor for output data tensor dx (input)
+ * @param dx Data delta tensor dx (output)
+ * @param scaleDesc Tensor descriptor for the scale tensor (input)
+ * @param biasDesc Tensor descriptor for the bias/shift tensor (input)
+ * @param savedMeanDesc Tensor descriptor for the saved mean tensor (input)
+ * @param savedVarDesc Tensor descriptor for the saved variance tensor (input)
+ * @param bnScale Batch norm scaling, gamma, tensor (input)
+ * @param resultBnScaleDiff Tensor for dscale (output)
+ * @param resultBnBiasDiff Tensor for dbias (output)
+ * @param epsilon Value to stabilize inverse variance calculation (input)
+ * @param savedMean Saved mini-batch mean for backwards pass (input)
+ * @param savedInvVariance Saved mini-batch inverse variance for backwards pass (input)
+ * @return miopenStatus_t
+ */
+MIOPEN_EXPORT miopenStatus_t
+miopenBatchNormalizationBackward_V2(miopenHandle_t handle,
+                                    miopenBatchNormMode_t bn_mode,
+                                    const void* alphaDataDiff,
+                                    const void* betaDataDiff,
+                                    const void* alphaParamDiff,
+                                    const void* betaParamDiff,
+                                    const miopenTensorDescriptor_t xDesc,
+                                    const void* x,
+                                    const miopenTensorDescriptor_t dyDesc,
+                                    const void* dy,
+                                    const miopenTensorDescriptor_t dxDesc,
+                                    void* dx,
+                                    const miopenTensorDescriptor_t scaleDesc,
+                                    const miopenTensorDescriptor_t biasDesc,
+                                    const miopenTensorDescriptor_t savedMeanDesc,
+                                    const miopenTensorDescriptor_t savedVarDesc,
+                                    const void* bnScale,
+                                    void* resultBnScaleDiff,
+                                    void* resultBnBiasDiff,
+                                    double epsilon,
+                                    const void* savedMean,
+                                    const void* savedInvVariance);
+
 /** @} */
 // CLOSEOUT BATCHNORM DOXYGEN GROUP
diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp
index a3c5f93e36..938809d81c 100644
--- a/src/batch_norm.cpp
+++ b/src/batch_norm.cpp
@@ -67,6 +67,20 @@ void DeriveBNTensorDescriptor(TensorDescriptor& derivedBnDesc,
 TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& tDesc)
 {
     auto dataType = tDesc.GetType();
+    auto layout   = tDesc.GetLayout_t();
+    if(layout == miopenTensorNCDHW)
+    {
+        layout = miopenTensorNCHW;
+    }
+    else if(layout == miopenTensorNDHWC)
+    {
+        layout = miopenTensorNHWC;
+    }
+    else
+    {
+        std::cout << "Cannot handle layout: " << layout << "\n";
+        exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe)
+    }
     std::vector<std::size_t> dims(tDesc.GetLengths());
 
     // NxCxDxHxW -> NxCx(D*H)xW
@@ -74,7 +88,7 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor&
     dims[3] = dims[4];
     dims.pop_back();
 
-    return {dataType, dims};
+    return {dataType, layout, dims};
 }
 
 void profileSequence(const Handle& handle, unsigned char select, float* ctime)
diff --git a/src/batch_norm_api.cpp b/src/batch_norm_api.cpp
index 8f184a9508..d3b824cee0 100644
--- a/src/batch_norm_api.cpp
+++ b/src/batch_norm_api.cpp
@@ -50,6 +50,7 @@
 namespace miopen {
 namespace debug {
 
 void
LogCmdBNorm(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -60,6 +61,7 @@ void LogCmdBNorm(const miopenTensorDescriptor_t xDesc, if(miopen::IsLoggingCmd()) { const std::string& str = BnormArgsForMIOpenDriver(xDesc, + sMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -88,6 +90,130 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, void* estimatedMean, void* estimatedVariance, double epsilon) +{ + return miopenBatchNormalizationForwardInference_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScale, + bnBias, + estimatedMean, + estimatedVariance, + epsilon); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardTraining(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) +{ + return miopenBatchNormalizationForwardTraining_V2(handle, + bn_mode, + alpha, + beta, + xDesc, + x, + yDesc, + y, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScaleBiasMeanVarDesc, + bnScale, + bnBias, + expAvgFactor, + resultRunningMean, + resultRunningVariance, + epsilon, + resultSaveMean, + resultSaveInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationBackward(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + const void* alphaDataDiff, + const void* betaDataDiff, + const void* alphaParamDiff, + const void* betaParamDiff, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t dyDesc, + const void* dy, + const miopenTensorDescriptor_t dxDesc, + void* dx, + const miopenTensorDescriptor_t bnScaleBiasDiffDesc, + const void* bnScale, + void* resultBnScaleDiff, + void* resultBnBiasDiff, + double epsilon, + const void* savedMean, + const void* savedInvVariance) +{ + return miopenBatchNormalizationBackward_V2(handle, + bn_mode, + alphaDataDiff, + betaDataDiff, + alphaParamDiff, + betaParamDiff, + xDesc, + x, + dyDesc, + dy, + dxDesc, + dx, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScaleBiasDiffDesc, + bnScale, + resultBnScaleDiff, + resultBnBiasDiff, + epsilon, + savedMean, + savedInvVariance); +} + +extern "C" miopenStatus_t +miopenBatchNormalizationForwardInference_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t estMeanDesc, + const miopenTensorDescriptor_t estVarianceDesc, + void* bnScale, + void* bnBias, + void* estimatedMean, + void* estimatedVariance, + double epsilon) { MIOPEN_LOG_FUNCTION(handle, bn_mode, @@ -95,7 +221,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + BiasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, estimatedMean, @@ -103,12 +232,14 @@ 
miopenBatchNormalizationForwardInference(miopenHandle_t handle, epsilon); miopen::debug::LogCmdBNorm(xDesc, + estMeanDesc, bn_mode, estimatedMean, estimatedVariance, nullptr, nullptr, miopen::debug::BatchNormDirection_t::ForwardInference); + // In case of NxCxDxHxW int size{0}; miopenGetTensorDescriptorSize(xDesc, &size); @@ -124,9 +255,10 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc)) - : miopen::deref(bnScaleBiasMeanVarDesc), + miopen::deref(scaleDesc), + miopen::deref(BiasDesc), + miopen::deref(estMeanDesc), + miopen::deref(estVarianceDesc), DataCast(bnScale), DataCast(bnBias), DataCast(estimatedMean), @@ -136,32 +268,37 @@ miopenBatchNormalizationForwardInference(miopenHandle_t handle, } extern "C" miopenStatus_t -miopenBatchNormalizationForwardTraining(miopenHandle_t handle, - miopenBatchNormMode_t bn_mode, - void* alpha, - void* beta, - const miopenTensorDescriptor_t xDesc, - const void* x, - const miopenTensorDescriptor_t yDesc, - void* y, - const miopenTensorDescriptor_t bnScaleBiasMeanVarDesc, - void* bnScale, - void* bnBias, - double expAvgFactor, - void* resultRunningMean, - void* resultRunningVariance, - double epsilon, - void* resultSaveMean, - void* resultSaveInvVariance) +miopenBatchNormalizationForwardTraining_V2(miopenHandle_t handle, + miopenBatchNormMode_t bn_mode, + void* alpha, + void* beta, + const miopenTensorDescriptor_t xDesc, + const void* x, + const miopenTensorDescriptor_t yDesc, + void* y, + const miopenTensorDescriptor_t scaleDesc, + const miopenTensorDescriptor_t BiasDesc, + const miopenTensorDescriptor_t savedMeanDesc, + const miopenTensorDescriptor_t savedVarianceDesc, + void* bnScale, + void* bnBias, + double expAvgFactor, + void* resultRunningMean, + void* resultRunningVariance, + double epsilon, + void* resultSaveMean, + void* resultSaveInvVariance) { - MIOPEN_LOG_FUNCTION(handle, bn_mode, xDesc, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + BiasDesc, + savedMeanDesc, + savedVarianceDesc, bnScale, bnBias, expAvgFactor, @@ -172,6 +309,7 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, resultSaveInvVariance); miopen::debug::LogCmdBNorm(xDesc, + savedMeanDesc, bn_mode, resultRunningMean, resultRunningVariance, @@ -193,9 +331,10 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle, (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(yDesc)) : miopen::deref(yDesc), DataCast(y), - (size == 5) - ? 
miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasMeanVarDesc))
-            : miopen::deref(bnScaleBiasMeanVarDesc),
+        miopen::deref(scaleDesc),
+        miopen::deref(BiasDesc),
+        miopen::deref(savedMeanDesc),
+        miopen::deref(savedVarianceDesc),
         DataCast(bnScale),
         DataCast(bnBias),
         expAvgFactor,
@@ -208,27 +347,29 @@ miopenBatchNormalizationForwardTraining(miopenHandle_t handle,
 }
 
 extern "C" miopenStatus_t
-miopenBatchNormalizationBackward(miopenHandle_t handle,
-                                 miopenBatchNormMode_t bn_mode,
-                                 const void* alphaDataDiff,
-                                 const void* betaDataDiff,
-                                 const void* alphaParamDiff,
-                                 const void* betaParamDiff,
-                                 const miopenTensorDescriptor_t xDesc,
-                                 const void* x,
-                                 const miopenTensorDescriptor_t dyDesc,
-                                 const void* dy,
-                                 const miopenTensorDescriptor_t dxDesc,
-                                 void* dx,
-                                 const miopenTensorDescriptor_t bnScaleBiasDiffDesc,
-                                 const void* bnScale,
-                                 void* resultBnScaleDiff,
-                                 void* resultBnBiasDiff,
-                                 double epsilon,
-                                 const void* savedMean,
-                                 const void* savedInvVariance)
+miopenBatchNormalizationBackward_V2(miopenHandle_t handle,
+                                    miopenBatchNormMode_t bn_mode,
+                                    const void* alphaDataDiff,
+                                    const void* betaDataDiff,
+                                    const void* alphaParamDiff,
+                                    const void* betaParamDiff,
+                                    const miopenTensorDescriptor_t xDesc,
+                                    const void* x,
+                                    const miopenTensorDescriptor_t dyDesc,
+                                    const void* dy,
+                                    const miopenTensorDescriptor_t dxDesc,
+                                    void* dx,
+                                    const miopenTensorDescriptor_t scaleDesc,
+                                    const miopenTensorDescriptor_t BiasDesc,
+                                    const miopenTensorDescriptor_t savedMeanDesc,
+                                    const miopenTensorDescriptor_t savedVarianceDesc,
+                                    const void* bnScale,
+                                    void* resultBnScaleDiff,
+                                    void* resultBnBiasDiff,
+                                    double epsilon,
+                                    const void* savedMean,
+                                    const void* savedInvVariance)
 {
-    MIOPEN_LOG_FUNCTION(handle,
                         bn_mode,
                         xDesc,
@@ -237,7 +378,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
                         dy,
                         dxDesc,
                         dx,
-                        bnScaleBiasDiffDesc,
+                        scaleDesc,
+                        BiasDesc,
+                        savedMeanDesc,
+                        savedVarianceDesc,
                         bnScale,
                         resultBnScaleDiff,
                         resultBnBiasDiff,
@@ -245,6 +389,7 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
                         savedMean,
                         savedInvVariance);
     miopen::debug::LogCmdBNorm(xDesc,
+                               savedMeanDesc,
                                bn_mode,
                                nullptr,
                                nullptr,
@@ -271,9 +416,10 @@ miopenBatchNormalizationBackward(miopenHandle_t handle,
        (size == 5) ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(dxDesc))
                    : miopen::deref(dxDesc),
        DataCast(dx),
-       (size == 5)
-           ? miopen::BuildReshaped4DTensorDescriptor(miopen::deref(bnScaleBiasDiffDesc))
-           : miopen::deref(bnScaleBiasDiffDesc),
+       miopen::deref(scaleDesc),
+       miopen::deref(BiasDesc),
+       miopen::deref(savedMeanDesc),
+       miopen::deref(savedVarianceDesc),
        DataCast(bnScale),
        DataCast(resultBnScaleDiff),
        DataCast(resultBnBiasDiff),
diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp
index c105996d57..e75ec31902 100644
--- a/src/driver_arguments.cpp
+++ b/src/driver_arguments.cpp
@@ -60,12 +60,28 @@ void ConvDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc)
 }
 
-void BnDataType(std::stringstream& ss, const miopen::TensorDescriptor& desc)
+// Choose the driver name based on the input tensor and scaleMean.
+// We choose scaleMean because it is an accumulator type.
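
As a reference for the mapping this helper implements (the patch's actual helper follows below), a minimal standalone sketch over the two data types; only miopenDataType_t from miopen.h is assumed:

// Sketch only: mirrors the (input type, accumulator type) -> driver-name selection below.
inline const char* BnDriverNameSketch(miopenDataType_t xType, miopenDataType_t sMeanType)
{
    if(xType == miopenHalf)
        return (sMeanType == miopenFloat) ? "bnormfp16fp32" : "bnormfp16";
    if(xType == miopenBFloat16)
        return (sMeanType == miopenFloat) ? "bnormbfp16fp32" : "bnormbfp16";
    return "bnorm"; // default: fp32 input with fp32 accumulation
}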
+void BnDataType(std::stringstream& ss, + const miopen::TensorDescriptor& xDesc, + const miopen::TensorDescriptor& sMeanDesc) { - if(desc.GetType() == miopenHalf) + if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenHalf) { ss << "bnormfp16"; } + else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenBFloat16) + { + ss << "bnormbfp16"; + } + else if(xDesc.GetType() == miopenHalf && sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormfp16fp32"; + } + else if(xDesc.GetType() == miopenBFloat16 && sMeanDesc.GetType() == miopenFloat) + { + ss << "bnormbfp16fp32"; + } else { ss << "bnorm"; @@ -211,7 +227,8 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, return ss.str(); } -std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, +std::string BnormArgsForMIOpenDriver(const miopenTensorDescriptor_t xDesc, + const miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, @@ -224,7 +241,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, miopenGetTensorDescriptorSize(xDesc, &size); std::stringstream ss; if(print_for_bn_driver) - BnDataType(ss, miopen::deref(xDesc)); + BnDataType(ss, miopen::deref(xDesc), miopen::deref(sMeanDesc)); ss << " -n " << miopen::deref(xDesc).GetLengths()[0] // clang-format off << " -c " << miopen::deref(xDesc).GetLengths()[1]; @@ -248,6 +265,7 @@ std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, resultRunningVariance, resultSaveMean, resultSaveInvVariance); + ss << " --layout " << miopen::deref(xDesc).GetLayout_str(); } return ss.str(); } diff --git a/src/fusion.cpp b/src/fusion.cpp index a9ef5e27a9..e536f6a1a1 100644 --- a/src/fusion.cpp +++ b/src/fusion.cpp @@ -390,6 +390,7 @@ std::string LogCmdBnormFusion(const miopenFusionPlanDescriptor_t fusePlanDesc, i if(bn_op != nullptr) { str += BnormArgsForMIOpenDriver(&bn_op->input_desc, + &bn_op->base_desc, bn_op->mode, nullptr, nullptr, diff --git a/src/include/miopen/batch_norm.hpp b/src/include/miopen/batch_norm.hpp index 50c309550c..92444f039b 100644 --- a/src/include/miopen/batch_norm.hpp +++ b/src/include/miopen/batch_norm.hpp @@ -163,40 +163,44 @@ void bnFwdTrainSelectMulti(const Handle& handle, void profileSequence(const Handle& handle, unsigned char select, float* ctime); -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardInference(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - ConstData_t estimatedMean, - ConstData_t estimatedVariance, - double epsilon); - -MIOPEN_INTERNALS_EXPORT void -BatchNormForwardTraining(Handle& handle, - miopenBatchNormMode_t bn_mode, - const void* alpha, /* these don't seem to be used in conv */ - const void* beta, - const TensorDescriptor& xDesc, - ConstData_t x, - const TensorDescriptor& yDesc, - Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, - ConstData_t bnScale, - ConstData_t bnBias, - double expAvgFactor, - Data_t resultRunningMean, - Data_t resultRunningVariance, - double epsilon, - Data_t resultSaveMean, - Data_t resultSaveInvVariance); +MIOPEN_INTERNALS_EXPORT void BatchNormForwardInference(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& 
yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + ConstData_t estimatedMean, + ConstData_t estimatedVariance, + double epsilon); + +MIOPEN_INTERNALS_EXPORT void BatchNormForwardTraining(Handle& handle, + miopenBatchNormMode_t bn_mode, + const void* alpha, + const void* beta, + const TensorDescriptor& xDesc, + ConstData_t x, + const TensorDescriptor& yDesc, + Data_t y, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, + ConstData_t bnScale, + ConstData_t bnBias, + double expAvgFactor, + Data_t resultRunningMean, + Data_t resultRunningVariance, + double epsilon, + Data_t resultSaveMean, + Data_t resultSaveInvVariance); MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, @@ -210,7 +214,10 @@ MIOPEN_INTERNALS_EXPORT void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& BiasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index b87494b725..d28e91adfd 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -58,7 +58,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double expAvgFactor_, double epsilon_, bool resultsave_, @@ -67,7 +70,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), expAvgFactor(expAvgFactor_), epsilon(epsilon_), resultsave(resultsave_), @@ -82,13 +88,19 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob ProblemDescription(miopenBatchNormMode_t bn_mode_, const TensorDescriptor& xDesc_, const TensorDescriptor& yDesc_, - const TensorDescriptor& bnScaleBiasMeanVarDesc_, + const TensorDescriptor& scaleDesc_, + const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_) : direction(Direction::ForwardInference), bn_mode(bn_mode_), xDesc(xDesc_), yOrDyDesc(yDesc_), - scaleBiasDesc(bnScaleBiasMeanVarDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_) { SetSpatialDims(); @@ -101,7 +113,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& xDesc_, const TensorDescriptor& dyDesc_, const TensorDescriptor& dxDesc_, - const TensorDescriptor& bnScaleBiasDiffDesc_, + const TensorDescriptor& scaleDesc_, + 
const TensorDescriptor& biasDesc_, + const TensorDescriptor& sMeanDesc_, + const TensorDescriptor& sVarianceDesc_, double epsilon_, bool useSaved_) : direction(Direction::Backward), @@ -109,7 +124,10 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob xDesc(xDesc_), yOrDyDesc(dyDesc_), dxDesc(dxDesc_), - scaleBiasDesc(bnScaleBiasDiffDesc_), + scaleDesc(scaleDesc_), + biasDesc(biasDesc_), + sMeanDesc(sMeanDesc_), + sVarianceDesc(sVarianceDesc_), epsilon(epsilon_), useSaved(useSaved_) { @@ -153,13 +171,13 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob const TensorDescriptor& GetBnScaleBiasMeanVarDesc() const { assert(direction == Direction::ForwardTraining || direction == Direction::ForwardInference); - return scaleBiasDesc; + return scaleDesc; } const TensorDescriptor& GetScaleBiasDiffDesc() const { assert(direction == Direction::Backward); - return scaleBiasDesc; + return scaleDesc; } bool GetResultSave() const @@ -215,7 +233,11 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, Prob TensorDescriptor xDesc; // input TensorDescriptor yOrDyDesc; // output TensorDescriptor dxDesc; - TensorDescriptor scaleBiasDesc; + + TensorDescriptor scaleDesc; // scale + TensorDescriptor biasDesc; // bias (shift) + TensorDescriptor sMeanDesc; + TensorDescriptor sVarianceDesc; #ifdef __clang__ #pragma clang diagnostic push diff --git a/src/include/miopen/driver_arguments.hpp b/src/include/miopen/driver_arguments.hpp index da4064b7f0..a964e7fe27 100644 --- a/src/include/miopen/driver_arguments.hpp +++ b/src/include/miopen/driver_arguments.hpp @@ -67,6 +67,7 @@ std::string ConvArgsForMIOpenDriver(const miopen::TensorDescriptor& xDesc, bool print_for_conv_driver = true); std::string BnormArgsForMIOpenDriver(miopenTensorDescriptor_t xDesc, + miopenTensorDescriptor_t sMeanDesc, miopenBatchNormMode_t bn_mode, const void* resultRunningMean, const void* resultRunningVariance, diff --git a/src/include/miopen/fusion/problem_description.hpp b/src/include/miopen/fusion/problem_description.hpp index bcb37878d9..b3d1669cee 100644 --- a/src/include/miopen/fusion/problem_description.hpp +++ b/src/include/miopen/fusion/problem_description.hpp @@ -128,7 +128,14 @@ struct FusionDescription : ProblemDescriptionBase dynamic_cast(*fusion_plan_desc->op_map[idx]); miopen::TensorDescriptor out_desc; bn_op.GetOutputDesc(out_desc); - return {bn_op.mode, bn_op.input_desc, out_desc, bn_op.base_desc, not_used}; + return {bn_op.mode, + bn_op.input_desc, + out_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + not_used}; } else if(dir == miopen::batchnorm::Direction::ForwardTraining) { @@ -140,6 +147,9 @@ struct FusionDescription : ProblemDescriptionBase bn_op.input_desc, out_desc, bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, + bn_op.base_desc, not_used, // expAvgFactor filler not_used, true /* resultSave*/, @@ -156,6 +166,9 @@ struct FusionDescription : ProblemDescriptionBase out_desc, bn_op.input_desc, {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, + {} /*bn_op.base_desc*/, not_used, bn_op.useBatchStats /*useSaved*/}; } diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 40bcd34935..f33c5ac5db 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -55,6 +55,8 @@ miopen::PerformanceDb GetDb(const miopen::ExecutionContext& ctx, } } // namespace batchnorm +//============ BEGIN FORWARD TRAINING =============== + void 
BatchNormForwardTraining(Handle& handle, miopenBatchNormMode_t bn_mode, const void* alpha, @@ -63,7 +65,10 @@ void BatchNormForwardTraining(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, double expAvgFactor, @@ -73,13 +78,14 @@ void BatchNormForwardTraining(Handle& handle, Data_t resultSaveMean, Data_t resultSaveInvVariance) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + if(xDesc.GetNumDims() != yDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -105,9 +111,9 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); if(bnScale != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); if(bnBias != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnBias); + miopen::checkNumericsInput(handle, biasDesc, bnBias); } const auto resultsave = resultSaveMean != nullptr && resultSaveInvVariance != nullptr; @@ -116,7 +122,10 @@ void BatchNormForwardTraining(Handle& handle, const auto problem = batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + savedMeanDesc, + savedVarianceDesc, expAvgFactor, epsilon, resultsave, @@ -153,15 +162,16 @@ void BatchNormForwardTraining(Handle& handle, { miopen::checkNumericsOutput(handle, yDesc, y); if(resultRunningMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultRunningMean); if(resultRunningVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultRunningVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultRunningVariance); if(resultSaveMean != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveMean); + miopen::checkNumericsOutput(handle, savedMeanDesc, resultSaveMean); if(resultSaveInvVariance != nullptr) - miopen::checkNumericsOutput(handle, bnScaleBiasMeanVarDesc, resultSaveInvVariance); + miopen::checkNumericsOutput(handle, savedVarianceDesc, resultSaveInvVariance); } } + //================== END FWD TRAIN =================== //============ BEGIN FORWARD INFERENCE =============== @@ -173,31 +183,37 @@ void BatchNormForwardInference(Handle& handle, ConstData_t x, const TensorDescriptor& yDesc, Data_t y, - const TensorDescriptor& bnScaleBiasMeanVarDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& estMeanDesc, + const TensorDescriptor& estVarianceDesc, ConstData_t bnScale, ConstData_t bnBias, ConstData_t estimatedMean, ConstData_t estimatedVariance, double epsilon) { + if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsInput(handle, xDesc, x); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, bnScale); - miopen::checkNumericsInput(handle, 
bnScaleBiasMeanVarDesc, bnBias); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedMean); - miopen::checkNumericsInput(handle, bnScaleBiasMeanVarDesc, estimatedVariance); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnBias); + miopen::checkNumericsInput(handle, estMeanDesc, estimatedMean); + miopen::checkNumericsInput(handle, estVarianceDesc, estimatedVariance); } if(estimatedMean != nullptr && estimatedVariance != nullptr) { - if(x == nullptr || y == nullptr || bnScale == nullptr || bnBias == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } if(xDesc.GetNumDims() != yDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasMeanVarDesc.GetNumDims()) + xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != estMeanDesc.GetNumDims() || + xDesc.GetNumDims() != estVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -216,8 +232,8 @@ void BatchNormForwardInference(Handle& handle, MIOPEN_THROW(miopenStatusBadParm); } - const auto problem = - batchnorm::ProblemDescription{bn_mode, xDesc, yDesc, bnScaleBiasMeanVarDesc, epsilon}; + const auto problem = batchnorm::ProblemDescription{ + bn_mode, xDesc, yDesc, scaleDesc, biasDesc, estMeanDesc, estVarianceDesc, epsilon}; const auto invoke_params = [&]() { auto tmp = batchnorm::InfInvokeParams{}; @@ -250,7 +266,10 @@ void BatchNormForwardInference(Handle& handle, x, yDesc, y, - bnScaleBiasMeanVarDesc, + scaleDesc, + biasDesc, + estMeanDesc, + estVarianceDesc, bnScale, bnBias, 0, @@ -265,9 +284,11 @@ void BatchNormForwardInference(Handle& handle, miopen::checkNumericsOutput(handle, yDesc, y); } } + //================= END FORWARD INFERENCE ==================== //=============== BEGIN BACKWARDS PROPAGATION ================ + void BatchNormBackward(Handle& handle, miopenBatchNormMode_t bn_mode, const void* alphaDataDiff, @@ -280,7 +301,10 @@ void BatchNormBackward(Handle& handle, ConstData_t dy, const TensorDescriptor& dxDesc, Data_t dx, - const TensorDescriptor& bnScaleBiasDiffDesc, + const TensorDescriptor& scaleDesc, + const TensorDescriptor& biasDesc, + const TensorDescriptor& savedMeanDesc, + const TensorDescriptor& savedVarianceDesc, ConstData_t bnScale, Data_t resultBnScaleDiff, Data_t resultBnBiasDiff, @@ -296,20 +320,23 @@ void BatchNormBackward(Handle& handle, { miopen::checkNumericsInput(handle, xDesc, x); miopen::checkNumericsInput(handle, dyDesc, dy); - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, bnScale); + miopen::checkNumericsInput(handle, scaleDesc, bnScale); + miopen::checkNumericsInput(handle, biasDesc, bnScale); if(savedMean != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedMean); + miopen::checkNumericsInput(handle, savedMeanDesc, savedMean); if(savedInvVariance != nullptr) - miopen::checkNumericsInput(handle, bnScaleBiasDiffDesc, savedInvVariance); + miopen::checkNumericsInput(handle, savedVarianceDesc, savedInvVariance); } if(x == nullptr || dy == nullptr || bnScale == nullptr || dx == nullptr) { MIOPEN_THROW(miopenStatusBadParm); } - if(xDesc.GetNumDims() != dyDesc.GetNumDims() || - xDesc.GetNumDims() != bnScaleBiasDiffDesc.GetNumDims()) + if(xDesc.GetNumDims() != dyDesc.GetNumDims() || xDesc.GetNumDims() != scaleDesc.GetNumDims() || + xDesc.GetNumDims() != biasDesc.GetNumDims() || + xDesc.GetNumDims() != savedMeanDesc.GetNumDims() || + xDesc.GetNumDims() != savedVarianceDesc.GetNumDims()) { MIOPEN_THROW(miopenStatusBadParm); } @@ -336,8 
+363,16 @@ void BatchNormBackward(Handle& handle, const auto useSaved = savedMean != nullptr && savedInvVariance != nullptr; - const auto problem = batchnorm::ProblemDescription{ - bn_mode, xDesc, dyDesc, dxDesc, bnScaleBiasDiffDesc, epsilon, useSaved}; + const auto problem = batchnorm::ProblemDescription{bn_mode, + xDesc, + dyDesc, + dxDesc, + scaleDesc, + biasDesc, + savedMeanDesc, + savedVarianceDesc, + epsilon, + useSaved}; const auto algo = bn_mode == miopenBNSpatial ? AlgorithmName{"miopenBatchNormBackwardPropSpatial"} @@ -368,8 +403,8 @@ void BatchNormBackward(Handle& handle, if(miopen::CheckNumericsEnabled()) { miopen::checkNumericsOutput(handle, dxDesc, dx); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnScaleDiff); - miopen::checkNumericsOutput(handle, bnScaleBiasDiffDesc, resultBnBiasDiff); + miopen::checkNumericsOutput(handle, scaleDesc, resultBnScaleDiff); + miopen::checkNumericsOutput(handle, biasDesc, resultBnBiasDiff); } } } // namespace miopen diff --git a/src/solver/batchnorm/backward_ck.cpp b/src/solver/batchnorm/backward_ck.cpp index 7769e4d563..bca7afc3a5 100644 --- a/src/solver/batchnorm/backward_ck.cpp +++ b/src/solver/batchnorm/backward_ck.cpp @@ -201,7 +201,8 @@ bool BnCKBwdBackward::IsApplicable( return false; if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward) return false; - + if(!bn_problem.Is2D()) + return false; switch(bn_problem.GetXDesc().GetType()) { case miopenFloat: return CheckCKApplicability(bn_problem); diff --git a/src/solver/batchnorm/backward_per_activation.cpp b/src/solver/batchnorm/backward_per_activation.cpp index 93cf670194..af52fbc339 100644 --- a/src/solver/batchnorm/backward_per_activation.cpp +++ b/src/solver/batchnorm/backward_per_activation.cpp @@ -41,6 +41,8 @@ namespace batchnorm { bool BnBwdTrainingPerActivation::IsApplicable( const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const { + if(!problem.Is2D()) + return false; return problem.GetDirection() == miopen::batchnorm::Direction::Backward && problem.GetMode() == miopenBNPerActivation; } diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp index 29bbd5dba9..7fa9c0f89a 100644 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ b/src/solver/batchnorm/backward_spatial_multiple.cpp @@ -44,6 +44,10 @@ bool BnBwdTrainingSpatialMultiple::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + { + return false; + } #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp index 30b0c0495f..86fa5a68c7 100644 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ b/src/solver/batchnorm/backward_spatial_single.cpp @@ -45,6 +45,8 @@ bool BnBwdTrainingSpatialSingle::IsApplicable( if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || problem.GetMode() != miopenBNSpatial) return false; + if(!problem.Is2D()) + return false; #if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR if(problem.GetXDesc().GetType() == miopenHalf && diff --git a/test/bn_3d_peract_test.cpp b/test/bn_3d_peract_test.cpp index 19fd15e7ce..c5f96ff9ba 100644 --- a/test/bn_3d_peract_test.cpp +++ b/test/bn_3d_peract_test.cpp @@ -281,6 +281,9 @@ struct verify_forward_train_3d_bn_per_activation BuildReshaped4DTensorDescriptor(out.desc), 
out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -450,6 +453,9 @@ struct verify_forward_infer_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -573,6 +579,9 @@ struct verify_forward_infer_3d_bn_per_activation_use_est BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), + BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -747,6 +756,9 @@ struct verify_backward_3d_bn_per_activation_use_saved BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -948,6 +960,9 @@ struct verify_backward_3d_bn_per_activation_recalc BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), BuildReshaped4DTensorDescriptor(scale.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), + BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_3d_spatial_test.cpp b/test/bn_3d_spatial_test.cpp index 8d428fca2b..08bfdb5a57 100644 --- a/test/bn_3d_spatial_test.cpp +++ b/test/bn_3d_spatial_test.cpp @@ -327,6 +327,9 @@ struct verify_forward_train_3d_bn_spatial miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -516,6 +519,9 @@ struct verify_forward_infer_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), nullptr, @@ -632,6 +638,9 @@ struct verify_forward_infer_3d_bn_spatial_use_est miopen::BuildReshaped4DTensorDescriptor(out.desc), out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), + miopen::BuildReshaped4DTensorDescriptor(shift.desc), scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -913,6 +922,9 @@ struct verify_backward_3d_bn_spatial_recalc miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1138,6 +1150,9 @@ struct 
verify_backward_3d_bn_spatial_use_saved miopen::BuildReshaped4DTensorDescriptor(dx_out.desc), dx_out_dev.get(), miopen::BuildReshaped4DTensorDescriptor(scale.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), + miopen::BuildReshaped4DTensorDescriptor(dshift.desc), scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_peract_test.cpp b/test/bn_peract_test.cpp index 6622230666..4d83e05df7 100644 --- a/test/bn_peract_test.cpp +++ b/test/bn_peract_test.cpp @@ -271,6 +271,9 @@ struct verify_forward_train_bn_per_activation out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -433,6 +436,9 @@ struct verify_forward_infer_bn_per_activation_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -550,6 +556,9 @@ struct verify_forward_infer_bn_per_activation_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -716,6 +725,9 @@ struct verify_backward_bn_per_activation_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -909,6 +921,9 @@ struct verify_backward_bn_per_activation_recalc dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/bn_spatial_test.cpp b/test/bn_spatial_test.cpp index 82d1cc271b..95a8ee099a 100644 --- a/test/bn_spatial_test.cpp +++ b/test/bn_spatial_test.cpp @@ -308,6 +308,9 @@ struct verify_forward_train_bn_spatial out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), expAvgFactor, @@ -484,6 +487,9 @@ struct verify_forward_infer_bn_spatial_recalc out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), nullptr, @@ -596,6 +602,9 @@ struct verify_forward_infer_bn_spatial_use_est out.desc, out_dev.get(), scale.desc, + shift.desc, + shift.desc, + shift.desc, scale_dev.get(), shift_dev.get(), estMean_dev.get(), @@ -853,6 +862,9 @@ struct verify_backward_bn_spatial_recalc dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), @@ -1065,6 +1077,9 @@ struct verify_backward_bn_spatial_use_saved dx_out.desc, dx_out_dev.get(), scale.desc, + dshift.desc, + dshift.desc, + dshift.desc, scale_dev.get(), dscale_dev.get(), dshift_dev.get(), diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index c0c49b06b6..d525b79cf6 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -134,9 +134,9 @@ void convHostForward(const tensor& input, } } -template +template void batchNormSpatialHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const tensor& bias, double epsilon, @@ -162,20 +162,21 @@ void batchNormSpatialHostInference(const tensor& input, output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); + // std::cout << output(bidx, cidx, row, column) << ","; } } } }); } -template +template void batchNormPerActivHostInference(const tensor& input, - tensor& output, + tensor& output, const tensor& scale, const 
tensor& bias, double epsilon, - const tensor& estimatedMean, - const tensor& estimatedVariance) + const tensor& estimatedMean, + const tensor& estimatedVariance) { int n_batches, channels, height, width; std::tie(n_batches, channels, height, width) = miopen::tien<4>(input.desc.GetLengths()); @@ -202,17 +203,17 @@ void batchNormPerActivHostInference(const tensor& input, }); } -template +template void batchNormSpatialHostFwdTrain(const tensor& input, - tensor& out, + tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -265,30 +266,38 @@ void batchNormSpatialHostFwdTrain(const tensor& input, } // for (column) } // for (row) } // end for(n_batchs) - - saveMean(0, cidx, 0, 0) = mean_accum; - saveInvVar(0, cidx, 0, 0) = invVar; - - newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); - runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = - (n_batch * height * width == 1) ? variance_accum : (nhw / (nhw - 1)) * variance_accum; - runVar(0, cidx, 0, 0) = (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + if(!saveMean.data.empty()) + { + saveMean(0, cidx, 0, 0) = mean_accum; + saveInvVar(0, cidx, 0, 0) = invVar; + } + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, 0, 0) * (1 - expAvgFactor); + runMean(0, cidx, 0, 0) = mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) + adjust = (n_batch * height * width == 1) ? variance_accum + : (nhw / (nhw - 1)) * variance_accum; + runVar(0, cidx, 0, 0) = + (1 - expAvgFactor) * runVar(0, cidx, 0, 0) + expAvgFactor * adjust; + } }); } -template -void batchNormSpatialHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - tensor& dx_out, - const tensor& scale, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormSpatialHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& bnScale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { - int height, width, n_batch, channels; std::tie(n_batch, channels, height, width) = miopen::tien<4>(x_input.desc.GetLengths()); auto nhw = double(height * width * n_batch); @@ -297,14 +306,50 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, par_for(channels, 1, [&](int cidx) { double elemStd = 0.; unsigned int xhat_index; - double mean = savedMean(0, cidx, 0, 0); // HxW elements - double invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + double mean = 0.0; + double invVar = 0.0; double dyelem = 0.; std::vector xhat(static_cast(n_batch) * in_cstride, 0.0); // process the batch per channel dscale(0, cidx, 0, 0) = 0.; dbias(0, cidx, 0, 0) = 0.; + if(!savedMean.data.empty()) + { + + mean = savedMean(0, cidx, 0, 0); // HxW elements + invVar = savedInvVar(0, cidx, 0, 0); // HxW elements + } + else + { + double variance_accum = 0.; + double mean_accum = 0.; + double inv_Var = 0.; + + // process the batch per channel + for(int bidx = 0; bidx < n_batch; bidx++) + { // via mini_batch + for(int row = 0; row < height; row++) + { // via rows + for(int column = 0; column < width; column++) + { // via columns + // #1 calculate the mean + // 
iterating through the stack of images in the mini_batch + auto inval = static_cast(x_input(bidx, cidx, row, column)); + mean_accum += inval; + variance_accum += inval * inval; + } // end for (column) + } // end for (row) + } // end for (n) + + mean_accum /= nhw; + variance_accum /= nhw; + variance_accum += (-mean_accum * mean_accum); + inv_Var = 1.0 / sqrt(variance_accum); + + mean = mean_accum; + invVar = inv_Var; + } for(int row = 0; row < height; row++) { // via rows for(int column = 0; column < width; column++) @@ -333,29 +378,36 @@ void batchNormSpatialHostBwdTrain(const tensor& x_input, double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = + static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) }); // for (channel) } -template +template void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, double gamma, double beta, double alpha, - const tensor& x_input, - const tensor& dy_input, - const tensor& y_input, - tensor& dx_out, - const tensor& scale, - const tensor& bias, - tensor& dscale, - tensor& dbias, - const tensor& savedMean, - const tensor& savedInvVar) + const tensor& x_input, + const tensor& dy_input, + const tensor& y_input, + tensor& dx_out, + const tensor& bnScale, + const tensor& bias, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -387,7 +439,8 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, elemStd = static_cast(x_input(bidx, cidx, row, column)) - mean; // (x_i - mean) xhat[xhat_index] = elemStd * invVar; - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -408,8 +461,9 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, { // via columns for(int bidx = 0; bidx < n_batch; bidx++) { // via mini_batch - xhat_index = in_cstride * bidx + (width * row + column); - double bnrefowd = scale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); + xhat_index = in_cstride * bidx + (width * row + column); + double bnrefowd = + bnScale(0, cidx, 0, 0) * xhat[xhat_index] + bias(0, cidx, 0, 0); activationHostBwdElement(activMode, gamma, beta, @@ -421,25 +475,25 @@ void batchNormActivSpatialHostBwdTrain(miopenActivationMode_t activMode, // double tmp1 = nhw * dy_input(bidx, cidx, row, column) - dbias(0, cidx, 0, 0); double tmp1 = nhw * dyelem - dbias(0, cidx, 0, 0); double tmp2 = -xhat[xhat_index] * dscale(0, cidx, 0, 0); - double tmp3 = (scale(0, cidx, 0, 0) * invVar) / nhw; - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); + double tmp3 = (bnScale(0, cidx, 0, 0) * invVar) / nhw; + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * (tmp2 + tmp1)); } // end for(n_batchs) } // for (column) } // for (row) }); // for (channel) } -template +template void batchNormPerActHostFwdTrain(const tensor& input, - tensor& out, + tensor& out, const tensor& scale, const tensor& bias, double epsilon, double expAvgFactor, - tensor& saveMean, - tensor& saveInvVar, - tensor& runMean, - tensor& runVar) + tensor& saveMean, + tensor& 
saveInvVar, + tensor& runMean, + tensor& runVar) { int height, width, n_batch, channels; @@ -483,7 +537,7 @@ void batchNormPerActHostFwdTrain(const tensor& input, elemStd = (input(bidx, cidx, row, column) - mean_accum); // (x_i - mean) inhat = elemStd * elemInvVar; // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out(bidx, cidx, row, column) = static_cast( + out(bidx, cidx, row, column) = static_cast( scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); } // end for(n_batch) @@ -496,21 +550,21 @@ void batchNormPerActHostFwdTrain(const tensor& input, runVar(0, cidx, row, column) = (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); } // for (column) } // for (row) }); } -template +template void batchNormPerActHostBwdTrain(const tensor& x_input, const tensor& dy_input, const tensor& scale, - tensor& dscale, - tensor& dbias, - tensor& dx_out, + tensor& dscale, + tensor& dbias, + tensor& dx_out, const tensor& savedMean, const tensor& savedInvVar) { diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index f5227217e4..e1f192c37d 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -32,17 +32,25 @@ #include "bn_test_data.hpp" #include "test_operations.hpp" +// Define an enum to identify which version of BN api to call +enum BNApiType +{ + testBNAPIV1, + testBNAPIV2, +}; + template -struct BNInferTest : public ::testing::TestWithParam> +struct BNInferTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_infer_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); @@ -51,21 +59,47 @@ struct BNInferTest : public ::testing::TestWithParam bn_infer_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; template -struct BNBwdTest : public ::testing::TestWithParam> +struct BNBwdTest + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = miopenBatchNormalizationBackward(&handle, - bn_config.mode, - &bn_bwd_test_data.alphaDataDiff, - &bn_bwd_test_data.betaDataDiff, - &bn_bwd_test_data.alphaParamDiff, - &bn_bwd_test_data.betaParamDiff, - &bn_bwd_test_data.input.desc, - bn_bwd_test_data.in_dev.get(), - &bn_bwd_test_data.dy.desc, - bn_bwd_test_data.dy_dev.get(), - &bn_bwd_test_data.output.desc, - bn_bwd_test_data.out_dev.get(), - &bn_bwd_test_data.bnScale.desc, - bn_bwd_test_data.bnScale_dev.get(), - bn_bwd_test_data.dScale_dev.get(), - bn_bwd_test_data.dBias_dev.get(), - bn_bwd_test_data.epsilon, - bn_bwd_test_data.savedMean_dev.get(), - bn_bwd_test_data.savedInvVar_dev.get()); + if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res = miopenStatusUnknownError; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationBackward(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + 
&bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationBackward_V2(&handle, + bn_config.mode, + &bn_bwd_test_data.alphaDataDiff, + &bn_bwd_test_data.betaDataDiff, + &bn_bwd_test_data.alphaParamDiff, + &bn_bwd_test_data.betaParamDiff, + &bn_bwd_test_data.input.desc, + bn_bwd_test_data.in_dev.get(), + &bn_bwd_test_data.dy.desc, + bn_bwd_test_data.dy_dev.get(), + &bn_bwd_test_data.output.desc, + bn_bwd_test_data.out_dev.get(), + &bn_bwd_test_data.bnScale.desc, + &bn_bwd_test_data.dBias.desc, + &bn_bwd_test_data.savedMean.desc, + &bn_bwd_test_data.savedInvVar.desc, + bn_bwd_test_data.bnScale_dev.get(), + bn_bwd_test_data.dScale_dev.get(), + bn_bwd_test_data.dBias_dev.get(), + bn_bwd_test_data.epsilon, + bn_bwd_test_data.savedMean_dev.get(), + bn_bwd_test_data.savedInvVar_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationBackward failed"; @@ -183,6 +255,7 @@ struct BNBwdTest : public ::testing::TestWithParam bn_bwd_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; template struct BNFwdTrainTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout) = GetParam(); + std::tie(bn_config, tensor_layout, api_type) = GetParam(); bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); auto&& handle = get_handle(); - auto res = - miopenBatchNormalizationForwardTraining(&handle, - bn_config.mode, - &bn_fwd_train_test_data.alpha, - &bn_fwd_train_test_data.beta, - &bn_fwd_train_test_data.input.desc, - bn_fwd_train_test_data.in_dev.get(), - &bn_fwd_train_test_data.output.desc, - bn_fwd_train_test_data.out_dev.get(), - &bn_fwd_train_test_data.scale.desc, - bn_fwd_train_test_data.scale_dev.get(), - bn_fwd_train_test_data.shift_dev.get(), - bn_fwd_train_test_data.averageFactor, - bn_fwd_train_test_data.runMean_dev.get(), - bn_fwd_train_test_data.runVariance_dev.get(), - bn_fwd_train_test_data.epsilon, - bn_fwd_train_test_data.saveMean_dev.get(), - bn_fwd_train_test_data.saveVariance_dev.get()); + if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) + { + test_skipped = true; + GTEST_SKIP() << "Not Applicable on " << handle.GetDeviceName() << " Architecture"; + } + miopenStatus_t res = miopenStatusUnknownError; + if(api_type == BNApiType::testBNAPIV1) + { + res = miopenBatchNormalizationForwardTraining( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + 
bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + } + else if(api_type == BNApiType::testBNAPIV2) + { + res = miopenBatchNormalizationForwardTraining_V2( + &handle, + bn_config.mode, + &bn_fwd_train_test_data.alpha, + &bn_fwd_train_test_data.beta, + &bn_fwd_train_test_data.input.desc, + bn_fwd_train_test_data.in_dev.get(), + &bn_fwd_train_test_data.output.desc, + bn_fwd_train_test_data.out_dev.get(), + &bn_fwd_train_test_data.scale.desc, + &bn_fwd_train_test_data.shift.desc, + &bn_fwd_train_test_data.saveMean.desc, + &bn_fwd_train_test_data.saveVariance.desc, + bn_fwd_train_test_data.scale_dev.get(), + bn_fwd_train_test_data.shift_dev.get(), + bn_fwd_train_test_data.averageFactor, + bn_fwd_train_test_data.runMean_dev.get(), + bn_fwd_train_test_data.runVariance_dev.get(), + bn_fwd_train_test_data.epsilon, + bn_fwd_train_test_data.saveMean_dev.get(), + bn_fwd_train_test_data.saveVariance_dev.get()); + } + else + GTEST_FAIL() << "ERROR: unknown bn api type!!"; if(res != miopenStatusSuccess) { GTEST_FAIL() << "miopenBatchNormalizationForwardTraining failed"; @@ -275,4 +383,5 @@ struct BNFwdTrainTest BNFwdTrainTestData bn_fwd_train_test_data; miopenTensorLayout_t tensor_layout; + BNApiType api_type; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index f2d54e8077..df093a4710 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -26,46 +26,123 @@ #include "bn.hpp" -struct GPU_BNBwd_FP16 +// https://github.com/ROCm/MIOpen/issues/1549 +// NCHW solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +// struct GPU_BN_V1_BwdNCHW_FP16 : BNBwdTest +// { +// }; + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V2_BwdNHWC_FP16 : BNBwdTest { }; -struct GPU_BNBwd_FP32 : BNBwdTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V2_BwdNHWC_BFP16 : BNBwdTest +{ +}; + +struct GPU_BN_V1_Bwd_FP32 : BNBwdTest +{ +}; + +struct GPU_BN_V2_Bwd_FP32 : BNBwdTest { }; -struct GPU_BNBwd_BFP16 : BNBwdTest +struct GPU_BN_V1_BwdNHWC_FP64 : BNBwdTest { }; -struct GPU_BNBwd_FP64 : BNBwdTest +struct GPU_BN_V2_BwdNHWC_FP64 : BNBwdTest { }; -TEST_P(GPU_BNBwd_FP16, BnBwdCKHalf) {} +// fp16 +// TEST_P(GPU_BN_V1_BwdNCHW_FP16, BnV1BwdHalf) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP16, BnV2BwdCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Bwd_FP32, BnV1BwdFloat) {} +TEST_P(GPU_BN_V2_Bwd_FP32, BnV2BwdFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_BFP16, BnV1BwdCKBfloat) {} +TEST_P(GPU_BN_V2_BwdNHWC_BFP16, BnV2BwdCKBfloat) {} -TEST_P(GPU_BNBwd_FP32, BnBwdCKFloat) {} +// double is only on CK solver +TEST_P(GPU_BN_V1_BwdNHWC_FP64, BnV1BwdCKDouble) {} +TEST_P(GPU_BN_V2_BwdNHWC_FP64, BnV2BwdCKDouble) {} -TEST_P(GPU_BNBwd_BFP16, BnBwdCKBFloat16) {} -TEST_P(GPU_BNBwd_FP64, BnBwdCKDouble) {} +// // fp16 +// INSTANTIATE_TEST_SUITE_P(Smoke, +// GPU_BN_V1_BwdNCHW_FP16, +// testing::Combine(testing::ValuesIn(NetworkSmall()), +// testing::Values(miopenTensorNCHW), +// testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_BwdNHWC_FP16, + 
testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Bwd_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_Bwd_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_BwdNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_BwdNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_BwdNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNBwd_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_BwdNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index b5dbed4705..ed25631175 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -26,46 +26,119 @@ #include "bn.hpp" -struct GPU_BNFwdTrain_FP16 +// ** OCL kernel for fwd training are failing gtest ** +// ** Hence, this gtest only tests CK solvers ** + +// NHWC solver accepts +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +struct GPU_BN_V1_FwdTrainNHWC_FP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BN_V2_FwdTrainNHWC_FP16 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_FP32 : BNFwdTrainTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_FwdTrainNHWC_BFP16 : BNFwdTrainTest +{ +}; + +struct GPU_BN_V2_FwdTrainNHWC_BFP16 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_FP64 : BNFwdTrainTest +struct GPU_BN_V1_FwdTrainNHWC_FP32 : BNFwdTrainTest { }; -struct GPU_BNFwdTrain_BFP16 : BNFwdTrainTest +struct GPU_BN_V2_FwdTrainNHWC_FP32 : BNFwdTrainTest { }; -TEST_P(GPU_BNFwdTrain_FP16, BnFwdTrainCKHalf) {} +struct GPU_BN_V1_FwdTrainNHWC_FP64 : BNFwdTrainTest +{ +}; -TEST_P(GPU_BNFwdTrain_FP32, BnFwdTrainCKFloat) {} +struct GPU_BN_V2_FwdTrainNHWC_FP64 : BNFwdTrainTest +{ +}; -TEST_P(GPU_BNFwdTrain_FP64, BnFwdTrainCKDouble) {} -TEST_P(GPU_BNFwdTrain_BFP16, BnFwdTrainCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP16, BnV1FwdTrainHalf) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP16, BnV2FwdTrainCKHalf) {} + +// float +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP32, BnV1FwdTrainFloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP32, BnV2FwdTrainFloat) {} 
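// Each suite below follows the same recipe: the test parameter is a 3-tuple
// (config, layout, API version) built by testing::Combine and unpacked in
// SetUp() via std::tie. A self-contained sketch of the pattern, with a
// hypothetical fixture name (only the shape of the code is implied here):
//
//   using BnParam = std::tuple<BNTestCase, miopenTensorLayout_t, BNApiType>;
//
//   struct BnExampleSuite : testing::TestWithParam<BnParam>
//   {
//   };
//
//   TEST_P(BnExampleSuite, Run)
//   {
//       BNTestCase cfg;
//       miopenTensorLayout_t layout;
//       BNApiType api;
//       std::tie(cfg, layout, api) = GetParam();
//       // dispatch on 'api' to the V1 or V2 miopen entry point
//   }
//
//   INSTANTIATE_TEST_SUITE_P(Smoke,
//                            BnExampleSuite,
//                            testing::Combine(testing::ValuesIn(NetworkSmall()),
//                                             testing::Values(miopenTensorNHWC),
//                                             testing::Values(testBNAPIV1, testBNAPIV2)));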
+ +// bfp16 +TEST_P(GPU_BN_V1_FwdTrainNHWC_BFP16, BnV1FwdTrainCKBfloat) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_BFP16, BnV2FwdTrainCKBfloat) {} + +// double +TEST_P(GPU_BN_V1_FwdTrainNHWC_FP64, BnV1FwdTrainCKDouble) {} +TEST_P(GPU_BN_V2_FwdTrainNHWC_FP64, BnV2FwdTrainCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_FwdTrainNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_FwdTrainNHWC_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFwdTrain_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V2_FwdTrainNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index 081d583213..aea15c097e 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -27,66 +27,124 @@ #include "bn.hpp" // NCHW solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : float -// BiasDataType : float -// MeanVarDataType : float -struct GPU_BNInferNCHW_FP16 : BNInferTest +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : half_float::half +struct GPU_BN_V1_InferNCHW_FP16 : BNInferTest { }; // NHWC solver accepts -// XDataType : half -// YDataYype : half -// ScaleDataType : half -// BiasDataType : half +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half // MeanVarDataType : float -struct GPU_BNInferNHWC_FP16 +struct GPU_BN_V2_InferNHWC_FP16 : BNInferTest { }; -struct GPU_BNInfer_FP32 : BNInferTest +// bf16 NHWC solver accepts is only on CK solver +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : bfloat16 +// BiasDataType : bfloat16 +// MeanVarDataType : float +struct GPU_BN_V1_InferNHWC_BFP16 : 
BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_BFP16 : BNInferTest { }; -struct GPU_BNInfer_FP64 : BNInferTest +struct GPU_BN_V1_Infer_FP32 : BNInferTest { }; -struct GPU_BNInfer_BFP16 : BNInferTest +struct GPU_BN_V2_Infer_FP32 : BNInferTest { }; -TEST_P(GPU_BNInferNCHW_FP16, BnInferCKHalf) {} -TEST_P(GPU_BNInferNHWC_FP16, BnInferCKHalf) {} +struct GPU_BN_V1_InferNHWC_FP64 : BNInferTest +{ +}; + +struct GPU_BN_V2_InferNHWC_FP64 : BNInferTest +{ +}; -TEST_P(GPU_BNInfer_FP32, BnInferCKFloat) {} -TEST_P(GPU_BNInfer_FP64, BnInferCKDouble) {} -TEST_P(GPU_BNInfer_BFP16, BnInferCKBFloat16) {} +// fp16 +TEST_P(GPU_BN_V1_InferNCHW_FP16, BnV1InferHalf) {} +TEST_P(GPU_BN_V2_InferNHWC_FP16, BnV2InferCKHalf) {} + +// float +TEST_P(GPU_BN_V1_Infer_FP32, BnV1InferFloat) {} +TEST_P(GPU_BN_V2_Infer_FP32, BnV2InferFloat) {} + +// bfp16 is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_BFP16, BnV1InferCKBfloat) {} +TEST_P(GPU_BN_V2_InferNHWC_BFP16, BnV2InferCKBfloat) {} + +// double is only on CK solver +TEST_P(GPU_BN_V1_InferNHWC_FP64, BnV1InferCKDouble) {} +TEST_P(GPU_BN_V2_InferNHWC_FP64, BnV2InferCKDouble) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_InferNCHW_FP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V2_InferNHWC_FP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BN_V1_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNCHW), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNCHW_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNCHW))); + GPU_BN_V2_Infer_FP32, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// bfp16 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInferNHWC_FP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::Values(miopenTensorNHWC))); + GPU_BN_V1_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP32, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC, miopenTensorNCHW}))); + GPU_BN_V2_InferNHWC_BFP16, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); +// fp64 is only on CK solver INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_FP64, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V1_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkSmall()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV1}))); INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNInfer_BFP16, - testing::Combine(testing::ValuesIn(Network1()), - testing::ValuesIn({miopenTensorNHWC}))); + GPU_BN_V2_InferNHWC_FP64, + testing::Combine(testing::ValuesIn(NetworkLarge()), + testing::Values(miopenTensorNHWC), + testing::ValuesIn({testBNAPIV2}))); diff --git a/test/gtest/bn_test_data.hpp b/test/gtest/bn_test_data.hpp index fcf237400b..9afa8ea4ed 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -56,10 +56,13 @@ struct BNTestCase }; template -std::vector Network1(); +std::vector 
NetworkSmall(); + +template +std::vector NetworkLarge(); template <> -inline std::vector Network1() +inline std::vector NetworkLarge() { // pyt_mlperf_resnet50v1.5 return { @@ -95,6 +98,19 @@ inline std::vector Network1() {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}}; } +template <> +inline std::vector NetworkSmall() +{ + // pyt_mlperf_resnet50v1.5 + return { + {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, + {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, + + }; +} + template struct BNTestData { From e006bc467f733e1886cc202122d4e522a99e384e Mon Sep 17 00:00:00 2001 From: Chris Erb Date: Thu, 17 Oct 2024 12:00:03 -0500 Subject: [PATCH 2/4] gfx942kdb update (#3319) * regenerate gfx942 kdb * add gfx942kdb back to cmakelist --- CMakeLists.txt | 4 ++-- src/kernels/gfx942.kdb.bz2 | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 src/kernels/gfx942.kdb.bz2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 917ebd18fb..b1d8dc1dbf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -652,8 +652,8 @@ function(install_kdb FILE_NAME COMPONENT_NAME) endfunction() # Both the lists below should be in sync always -set(KDB_BZ2_FILES gfx90a.kdb.bz2 gfx1030.kdb.bz2 gfx908.kdb.bz2 gfx906.kdb.bz2 gfx900.kdb.bz2) -set(COMPONENT_LST gfx90akdb gfx1030kdb gfx908kdb gfx906kdb gfx900kdb) +set(KDB_BZ2_FILES gfx942.kdb.bz2 gfx90a.kdb.bz2 gfx1030.kdb.bz2 gfx908.kdb.bz2 gfx906.kdb.bz2 gfx900.kdb.bz2) +set(COMPONENT_LST gfx942kdb gfx90akdb gfx1030kdb gfx908kdb gfx906kdb gfx900kdb) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.17) foreach(__file __component IN ZIP_LISTS KDB_BZ2_FILES COMPONENT_LST) diff --git a/src/kernels/gfx942.kdb.bz2 b/src/kernels/gfx942.kdb.bz2 new file mode 100644 index 0000000000..506f8aa4ee --- /dev/null +++ b/src/kernels/gfx942.kdb.bz2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62522bd6140cfa23928ab5cdadf6541135b991ef856056219ed88fadf90cd93b +size 1670498 From eecfb26b2988d1b4b7aa8b549f2adf5521917eb9 Mon Sep 17 00:00:00 2001 From: BrianHarrisonAMD <169072757+BrianHarrisonAMD@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:06:50 -0600 Subject: [PATCH 3/4] Merge CK fwd mha FP16 solver (#3308) --- CMakeLists.txt | 2 +- src/CMakeLists.txt | 3 +- src/include/miopen/mha/solvers.hpp | 22 ++ src/problem.cpp | 4 +- src/solution.cpp | 34 ++- src/solver.cpp | 4 +- .../mha/mha_ck_fa_v2_solver_forward.cpp | 268 ++++++++++++++++++ test/gtest/cpu_multi_head_attention.hpp | 56 ++-- test/gtest/gpu_mha_backward.cpp | 2 +- test/gtest/gpu_mha_forward.cpp | 257 +++++++++++++---- test/gtest/graphapi_capi_mha_backward.cpp | 2 +- test/gtest/graphapi_capi_mha_common.hpp | 14 +- test/gtest/graphapi_capi_mha_forward.cpp | 52 +++- test/gtest/graphapi_mha_cpp_common.hpp | 36 +-- test/gtest/mha_helper.hpp | 75 +++-- 15 files changed, 694 insertions(+), 137 deletions(-) create mode 100644 src/solver/mha/mha_ck_fa_v2_solver_forward.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index b1d8dc1dbf..7b9e081c8d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -327,7 +327,7 @@ add_compile_definitions($<$:HIP_COMPILER_FLAGS=${HIP_COMPI # HIP if( MIOPEN_BACKEND STREQUAL "HIP" OR MIOPEN_BACKEND STREQUAL "HIPOC" OR MIOPEN_BACKEND STREQUAL "HIPNOGPU") if(MIOPEN_USE_COMPOSABLEKERNEL) - 
find_package(composable_kernel 1.0.0 COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations) + find_package(composable_kernel 1.0.0 COMPONENTS device_other_operations device_gemm_operations device_conv_operations device_reduction_operations device_mha_operations) endif() if( MIOPEN_BACKEND STREQUAL "HIPNOGPU") set(MIOPEN_MODE_NOGPU 1) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5f119620d2..376d7aaa00 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -311,6 +311,7 @@ set( MIOpen_Source solver/layernorm/forward_layernorm2d_ck.cpp solver/layernorm/forward_layernorm4d_ck.cpp solver/layernorm/forward_t5layernorm.cpp + solver/mha/mha_ck_fa_v2_solver_forward.cpp solver/mha/mha_solver_backward.cpp solver/mha/mha_solver_forward.cpp solver/multimarginloss/forward_multimarginloss.cpp @@ -845,7 +846,7 @@ target_include_directories(MIOpen PUBLIC ) if(MIOPEN_USE_COMPOSABLEKERNEL) -set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations hip::host) +set(MIOPEN_CK_LINK_FLAGS composable_kernel::device_other_operations composable_kernel::device_gemm_operations composable_kernel::device_conv_operations composable_kernel::device_reduction_operations composable_kernel::device_mha_operations hip::host) endif() if(WIN32) diff --git a/src/include/miopen/mha/solvers.hpp b/src/include/miopen/mha/solvers.hpp index 6bac473a71..55423f63c7 100644 --- a/src/include/miopen/mha/solvers.hpp +++ b/src/include/miopen/mha/solvers.hpp @@ -77,6 +77,28 @@ struct MhaBackward final : MhaSolver MIOPEN_INTERNALS_EXPORT bool MayNeedWorkspace() const override; }; +struct MhaCKFlashAttentionV2Forward final : MhaSolver +{ + const std::string& SolverDbId() const override + { + return GetSolverDbId(); + } + + MIOPEN_INTERNALS_EXPORT bool + IsApplicable(const ExecutionContext& context, + const miopen::mha::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT ConvSolution + GetSolution(const ExecutionContext& context, + const miopen::mha::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT std::size_t + GetWorkspaceSize(const ExecutionContext& context, + const miopen::mha::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT bool MayNeedWorkspace() const override; +}; + } // namespace mha } // namespace solver diff --git a/src/problem.cpp b/src/problem.cpp index 8b406516ef..ba84856850 100644 --- a/src/problem.cpp +++ b/src/problem.cpp @@ -610,10 +610,12 @@ Problem::FindSolutionsImpl(Handle& handle, const auto algo = AlgorithmName{"Mha"}; + static solver::mha::MhaCKFlashAttentionV2Forward mhaCKFAForwardSolver; static solver::mha::MhaForward mhaForwardSolver; static solver::mha::MhaBackward mhaBackwardSolver; - std::vector solvers = {&mhaForwardSolver, &mhaBackwardSolver}; + std::vector solvers = { + &mhaCKFAForwardSolver, &mhaForwardSolver, &mhaBackwardSolver}; for(auto solver : solvers) { diff --git a/src/solution.cpp b/src/solution.cpp index 4fe447423a..3df767c90b 100644 --- a/src/solution.cpp +++ b/src/solution.cpp @@ -400,15 +400,32 @@ void Solution::RunImpl(Handle& handle, return; } - solver::mha::MhaForward mhaForward; - solver::mha::MhaBackward mhaBackward; + auto getSolution = [&](const ExecutionContext& ctx) { + auto solverId = GetSolver(); + solver::mha::MhaForward mhaForward; + solver::mha::MhaBackward mhaBackward; + 
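// The if/else chain that follows grows linearly with the number of MHA
// solvers. A table-driven equivalent, sketched on the assumption that
// GetSolution() is virtual on the common MhaSolver base (illustration only,
// not part of this patch):
//
//   const std::array<solver::mha::MhaSolver*, 3> mhaSolvers = {
//       &ckMhaForward, &mhaForward, &mhaBackward};
//   for(auto* s : mhaSolvers)
//   {
//       if(solverId == s->SolverDbId())
//           return s->GetSolution(ctx, problem_description);
//   }
//   MIOPEN_THROW("No MHA solver with matching SolverDbId of " + solverId.ToString());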
solver::mha::MhaCKFlashAttentionV2Forward ckMhaForward; + + if(solverId == ckMhaForward.SolverDbId()) + { + return ckMhaForward.GetSolution(ctx, problem_description); + } + else if(solverId == mhaForward.SolverDbId()) + { + return mhaForward.GetSolution(ctx, problem_description); + } + else if(solverId == mhaBackward.SolverDbId()) + { + return mhaBackward.GetSolution(ctx, problem_description); + } + + MIOPEN_THROW("No MHA solver with matching SolverDbId of " + solverId.ToString()); + }; if(!kernels.empty()) { const auto ctx = ExecutionContext{&handle}; - const auto mha_solution = GetSolver() == mhaForward.SolverDbId() - ? mhaForward.GetSolution(ctx, problem_description) - : mhaBackward.GetSolution(ctx, problem_description); + const auto mha_solution = getSolution(ctx); auto kernel_handles = std::vector{std::begin(kernels), std::end(kernels)}; invoker = (*mha_solution.invoker_factory)(kernel_handles); @@ -425,11 +442,8 @@ void Solution::RunImpl(Handle& handle, return; } - auto ctx = ExecutionContext{&handle}; - - const auto mha_solution = GetSolver() == mhaForward.SolverDbId() - ? mhaForward.GetSolution(ctx, problem_description) - : mhaBackward.GetSolution(ctx, problem_description); + auto ctx = ExecutionContext{&handle}; + const auto mha_solution = getSolution(ctx); invoker = handle.PrepareInvoker(*mha_solution.invoker_factory, mha_solution.construction_params); diff --git a/src/solver.cpp b/src/solver.cpp index d0ef398729..912479b9f5 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -700,7 +700,9 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Primitive::MultiMarginLoss, multimarginloss::MultiMarginLossForward{}.SolverDbId()); - // IMPORTANT: New solvers should be added to the end of the function! + Register(registry, ++id, Primitive::Mha, mha::MhaCKFlashAttentionV2Forward{}.SolverDbId()); + // IMPORTANT: New solvers should be added to the end of the function, and don't leave a white + // space between this comment and the newly registered solver(s)! } bool ThisSolverIsDeprecatedStatic::IsDisabled(const ExecutionContext& ctx) diff --git a/src/solver/mha/mha_ck_fa_v2_solver_forward.cpp b/src/solver/mha/mha_ck_fa_v2_solver_forward.cpp new file mode 100644 index 0000000000..6313f3920f --- /dev/null +++ b/src/solver/mha/mha_ck_fa_v2_solver_forward.cpp @@ -0,0 +1,268 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include +#include +#include +#include + +#if MIOPEN_USE_COMPOSABLEKERNEL +#include "ck_tile/ops/fmha.hpp" +#include "ck_tile/ops/fmha_fwd.hpp" +#include "ck/stream_config.hpp" +#endif + +MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_FA_CK_V2_FWD) + +namespace miopen { + +namespace solver { + +namespace mha { + +#if MIOPEN_USE_COMPOSABLEKERNEL +static std::string Convert(miopenDataType_t dataType) +{ + switch(dataType) + { + case miopenHalf: { + return "fp16"; + } + case miopenBFloat16: { + return "bfp16"; + } + default: { + MIOPEN_THROW("Unsupported datatype provided"); + } + } +} +#endif + +bool MhaCKFlashAttentionV2Forward::IsApplicable( + [[maybe_unused]] const ExecutionContext& context, + const miopen::mha::ProblemDescription& problem) const +{ +#if MIOPEN_USE_COMPOSABLEKERNEL + if(!problem.IsForward()) + { + return false; + } + + auto deviceName = context.GetStream().GetDeviceName(); + if(!StartsWith(deviceName, "gfx94") && deviceName != "gfx90a") + { + return false; + } + + const auto& descsFwd = problem.GetDescsForward(); + auto [N_k, H_k, S_k, D_k] = miopen::tien<4>(descsFwd.kDesc.GetLengths()); + auto [N_stride_k, H_stride_k, S_stride_k, D_stride_k] = + miopen::tien<4>(descsFwd.kDesc.GetStrides()); + + auto [N_q, H_q, S_q, D_q] = miopen::tien<4>(descsFwd.qDesc.GetLengths()); + auto [N_stride_q, H_stride_q, S_stride_q, D_stride_q] = + miopen::tien<4>(descsFwd.qDesc.GetStrides()); + + auto [N_stride_v, H_stride_v, S_stride_v, D_stride_v] = + miopen::tien<4>(descsFwd.vDesc.GetStrides()); + + auto [N_stride_o, H_stride_o, S_stride_o, D_stride_o] = + miopen::tien<4>(descsFwd.oDesc.GetStrides()); + + return !env::disabled(MIOPEN_DEBUG_FA_CK_V2_FWD) // + && H_q == H_k // Replace with H_q % H_k == 0 once we add support for MQA & GQA. + && D_q <= 256 // + && D_q % 8 == 0 // + && descsFwd.kDesc.IsPacked() // + && descsFwd.qDesc.IsPacked() // + && descsFwd.vDesc.IsPacked() // + && descsFwd.oDesc.IsPacked() // + && descsFwd.kDesc.GetType() == miopenHalf // + && descsFwd.qDesc.GetType() == miopenHalf // + && descsFwd.vDesc.GetType() == miopenHalf // + && descsFwd.oDesc.GetType() == miopenHalf // + && D_stride_k == 1 // CK requires D stride as 1. 
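// For a packed NHSD tensor the strides are fully determined by the lengths,
// so the D_stride == 1 checks below just pin the head dimension to the
// fastest-varying axis. Illustrative sketch (hypothetical helper):
//
//   // lengths {N, H, S, D} -> packed strides {H*S*D, S*D, D, 1}
//   std::array<size_t, 4> PackedStridesNHSD(size_t H, size_t S, size_t D)
//   {
//       return {H * S * D, S * D, D, size_t(1)};
//   }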
+ && D_stride_q == 1 && D_stride_v == 1 && D_stride_o == 1; +#else + return false; +#endif +} + +std::size_t MhaCKFlashAttentionV2Forward::GetWorkspaceSize( + [[maybe_unused]] const ExecutionContext& context, + [[maybe_unused]] const miopen::mha::ProblemDescription& problem) const +{ + return 0; +} + +ConvSolution +MhaCKFlashAttentionV2Forward::GetSolution([[maybe_unused]] const ExecutionContext& context, + const miopen::mha::ProblemDescription& problem) const +{ +#if MIOPEN_USE_COMPOSABLEKERNEL + auto result = ConvSolution{miopenStatusSuccess}; + result.workspace_sz = 0; + + const miopen::mha::MhaInputDescsForward& descsFwd = problem.GetDescsForward(); + auto [N_k, H_k, S_k, D_k] = miopen::tien<4>(descsFwd.kDesc.GetLengths()); + auto [N_stride_k, H_stride_k, S_stride_k, D_stride_k] = + miopen::tien<4>(descsFwd.kDesc.GetStrides()); + + auto [N_q, H_q, S_q, D_q] = miopen::tien<4>(descsFwd.qDesc.GetLengths()); + auto [N_stride_q, H_stride_q, S_stride_q, D_stride_q] = + miopen::tien<4>(descsFwd.qDesc.GetStrides()); + + auto [N_v, H_v, S_v, D_v] = miopen::tien<4>(descsFwd.vDesc.GetLengths()); + auto [N_stride_v, H_stride_v, S_stride_v, D_stride_v] = + miopen::tien<4>(descsFwd.vDesc.GetStrides()); + + auto [N_stride_o, H_stride_o, S_stride_o, D_stride_o] = + miopen::tien<4>(descsFwd.oDesc.GetStrides()); + + float scale_s = descsFwd.scale; + float scale_p = 1.0; + float scale_o = 1.0; + + fmha_fwd_traits fmha_traits; + fmha_traits.hdim_q = D_q; + fmha_traits.hdim_v = D_v; + fmha_traits.data_type = Convert(descsFwd.qDesc.GetType()); + fmha_traits.is_group_mode = false; + // is_v_rowmajor relates to the layout of the V tensor. Row major means NHSD, and Col major + // means NHDS. + fmha_traits.is_v_rowmajor = true; + fmha_traits.mask_type = mask_enum::no_mask; + fmha_traits.has_lse = false; + fmha_traits.do_fp8_static_quant = false; + fmha_traits.has_dropout = false; + fmha_traits.bias_type = bias_enum::no_bias; + + fmha_fwd_args fmha_args; + fmha_args.hdim_q = D_q; + fmha_args.hdim_v = D_v; + fmha_args.seqlen_k = S_k; + fmha_args.seqlen_q = S_q; + fmha_args.max_seqlen_q = S_q; + fmha_args.nhead_q = H_q; + fmha_args.nhead_k = H_k; + fmha_args.batch = N_q; + fmha_args.stride_q = S_stride_q; + fmha_args.stride_k = S_stride_k; + fmha_args.stride_v = S_stride_v; + fmha_args.stride_o = S_stride_o; + fmha_args.stride_bias = 0; + fmha_args.stride_randval = S_q; + fmha_args.nhead_stride_q = H_stride_q; + fmha_args.nhead_stride_k = H_stride_k; + fmha_args.nhead_stride_v = H_stride_v; + fmha_args.nhead_stride_o = H_stride_o; + fmha_args.nhead_stride_lse = S_q; + fmha_args.nhead_stride_bias = 0; + fmha_args.nhead_stride_randval = S_q * S_k; + fmha_args.batch_stride_q = N_stride_q; + fmha_args.batch_stride_k = N_stride_k; + fmha_args.batch_stride_v = N_stride_v; + fmha_args.batch_stride_o = N_stride_o; + fmha_args.batch_stride_lse = H_q * S_q; + fmha_args.batch_stride_bias = 0; + fmha_args.batch_stride_randval = H_q * S_q * S_k; + + // These are used for group mode, and we are in batch right now. + fmha_args.seqstart_q_ptr = nullptr; + fmha_args.seqstart_k_ptr = nullptr; + + // Batch does not support padding, and we aren't using kvcache yet. 
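// The stride assignments above follow one rule: CK's stride_* is the step
// between consecutive sequence positions (MIOpen's S stride), nhead_stride_*
// the step between heads (H stride), and batch_stride_* the step between
// batches (N stride). Sketch of that mapping for one tensor (hypothetical
// helper; int64_t stands in for CK's index type):
//
//   struct CkStrides { int64_t seq, head, batch; };
//   CkStrides MapNHSD(const std::array<size_t, 4>& s) // strides in {N,H,S,D} order
//   {
//       return {static_cast<int64_t>(s[2]),  // S stride -> stride_*
//               static_cast<int64_t>(s[1]),  // H stride -> nhead_stride_*
//               static_cast<int64_t>(s[0])}; // N stride -> batch_stride_*
//   }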
+ fmha_args.seqlen_k_ptr = nullptr; + + fmha_args.s_randval = false; + fmha_args.scale_s = scale_s; + fmha_args.scale_p = scale_p; + fmha_args.scale_o = scale_o; + fmha_args.window_size_left = 0; + fmha_args.window_size_right = 0; + fmha_args.mask_type = static_cast(fmha_traits.mask_type); + + result.invoker_factory = [=](const std::vector&) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + const auto& dataFwd = params.GetDataForward(); + + fmha_fwd_traits fmha_runtime_traits = fmha_traits; + fmha_fwd_args fmha_runtime_args = fmha_args; + + fmha_runtime_args.q_ptr = dataFwd.qData; + fmha_runtime_args.k_ptr = dataFwd.kData; + fmha_runtime_args.v_ptr = dataFwd.vData; + fmha_runtime_args.o_ptr = dataFwd.oData; + fmha_runtime_args.rand_val_ptr = nullptr; + fmha_runtime_args.bias_ptr = nullptr; + fmha_runtime_args.lse_ptr = nullptr; + + // Top-left causal mask + if(dataFwd.mask == miopenMhaMask_t::miopenMhaMaskCausal) + { + fmha_runtime_traits.mask_type = mask_enum::mask_top_left; + fmha_runtime_args.mask_type = + static_cast(mask_enum::mask_top_left); + fmha_runtime_args.window_size_left = -1; + fmha_runtime_args.window_size_right = 0; + } + + fmha_runtime_traits.has_dropout = false; + float probability = 0; + + // TODO : Change API to take in probability value as host side value instead of device + // pointer to match CK API. Calling a blocking hipMemcpy will cause issues with stream, + // and isn't async. + + fmha_runtime_args.p_drop = probability; + fmha_runtime_args.drop_seed_offset = + std::make_pair(dataFwd.dropoutSeedData, dataFwd.dropoutOffsetData); + + // Create stream_config, and set it to not time kernel. + ck_tile::stream_config stream_config; + stream_config.stream_id_ = handle_.GetStream(); + + { + HipEventProfiler profiler(handle_); + fmha_fwd(fmha_runtime_traits, fmha_runtime_args, stream_config); + } + }; + }; + + return result; +#else + return ConvSolution{miopenStatusNotImplemented}; +#endif +} + +bool MhaCKFlashAttentionV2Forward::MayNeedWorkspace() const { return false; } + +} // namespace mha + +} // namespace solver + +} // namespace miopen diff --git a/test/gtest/cpu_multi_head_attention.hpp b/test/gtest/cpu_multi_head_attention.hpp index 52fc8bcde6..f155f6c41b 100644 --- a/test/gtest/cpu_multi_head_attention.hpp +++ b/test/gtest/cpu_multi_head_attention.hpp @@ -81,16 +81,16 @@ struct CPUMHATest : public ::testing::TestWithParam if constexpr(std::is_same_v) { // forward - MultiHeadAttentionf32(q_val, - k_val, - v_val, - q_dot_k_transpose, - softmax, - attn_max, - z_sum, - aMax_S, - aMax_O, - multi_head_attention); + MultiHeadAttentionForwardf32(q_val, + k_val, + v_val, + q_dot_k_transpose, + softmax, + attn_max, + z_sum, + aMax_S, + aMax_O, + multi_head_attention); Concat(multi_head_attention, concatinated_attention); @@ -138,24 +138,24 @@ struct CPUMHATest : public ::testing::TestWithParam ScaleMult(v_val, v_scale, v_val_fp8); // forward - MultiHeadAttentionfp8(q_val_fp8, - k_val_fp8, - v_val_fp8, - softmax, // fp32 - attn_max, - z_sum, - q_descale, - k_descale, - v_descale, - s_descale, - s_scale, - o_scale, - 0.0f, - 0, - 0, - aMax_S, - aMax_O, - multi_head_attention); + MultiHeadAttentionForwardfp8(q_val_fp8, + k_val_fp8, + v_val_fp8, + softmax, // fp32 + attn_max, + z_sum, + q_descale, + k_descale, + v_descale, + s_descale, + s_scale, + o_scale, + 0.0f, + 0, + 0, + aMax_S, + aMax_O, + multi_head_attention); Concat(multi_head_attention, final_transformed_attention); 
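// The scale/descale pairs used in this test form the usual fp8 quantization
// round trip: multiply by scale before casting down, multiply by descale
// (1/scale) after casting back up. Minimal stand-alone sketch ('Fp8' is a
// placeholder for the fp8 type these tests use):
//
//   float scale   = 2.0f;           // chosen so the data fits fp8's range
//   float descale = 1.0f / scale;
//   float x       = 1.7f;
//   Fp8   q       = Fp8(scale * x);                  // cast rounds/saturates
//   float back    = descale * static_cast<float>(q); // back is approximately x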
ScaleMult(final_transformed_attention, o_descale, concatinated_attention); diff --git a/test/gtest/gpu_mha_backward.cpp b/test/gtest/gpu_mha_backward.cpp index cde87797ab..86a28d4e43 100644 --- a/test/gtest/gpu_mha_backward.cpp +++ b/test/gtest/gpu_mha_backward.cpp @@ -204,7 +204,7 @@ class Test_Bwd_Mha : public testing::TestWithParam // proper O, M and zInv tensors are required for backward pass. // randomly generated M and zInv may lead to nan\inf values - test::cpu::MultiHeadAttentionfp8( + test::cpu::MultiHeadAttentionForwardfp8( std::get>(tensors[miopenTensorMhaQ]->m_cpu_tensor), std::get>(tensors[miopenTensorMhaK]->m_cpu_tensor), std::get>(tensors[miopenTensorMhaV]->m_cpu_tensor), diff --git a/test/gtest/gpu_mha_forward.cpp b/test/gtest/gpu_mha_forward.cpp index fe36355cdb..638c050bad 100644 --- a/test/gtest/gpu_mha_forward.cpp +++ b/test/gtest/gpu_mha_forward.cpp @@ -57,7 +57,8 @@ struct TensorStruct ~TensorStruct() = default; - std::variant, tensor, tensor> m_cpu_tensor; + std::variant, tensor, tensor, tensor> + m_cpu_tensor; Allocator::ManageDataPtr m_gpu_buffer; }; @@ -105,12 +106,50 @@ inline std::vector GetFullTestCases() {11, 150, 256, 31, 0.4f}, }; } + +inline std::vector GetFp16SmokeCases() +{ + return { + {2, 1, 1, 256, 0.0f}, + {2, 2, 65, 128, 0.0f}, + {3, 2, 257, 64, 0.0f}, + {3, 5, 528, 32, 0.0f}, + {3, 7, 712, 16, 0.0f}, + {5, 3, 1111, 8, 0.0f}, + }; +} + +inline std::vector GetFp16FullTestCases() +{ + return { + {3, 11, 1731, 8, 0.0f}, + {2049, 5, 7, 8, 0.0f}, + {5, 2000, 32, 8, 0.0f}, + {3, 9, 1407, 16, 0.0f}, + {1027, 5, 21, 16, 0.0f}, + {5, 1040, 32, 24, 0.0f}, + {3, 7, 1212, 32, 0.0f}, + {550, 5, 16, 40, 0.0f}, + {5, 550, 40, 48, 0.0f}, + {2, 9, 1057, 64, 0.0f}, + {250, 3, 19, 72, 0.0f}, + {5, 230, 27, 80, 0.0f}, + {2, 5, 920, 128, 0.0f}, + {111, 2, 27, 136, 0.0f}, + {3, 110, 22, 152, 0.0f}, + {2, 4, 600, 224, 0.0f}, + {57, 1, 63, 232, 0.0f}, + {2, 65, 18, 256, 0.0f}, + }; +} + } // namespace template class Test_Fwd_Mha : public testing::TestWithParam { - static_assert(std::is_same_v || std::is_same_v); + static_assert(std::is_same_v || std::is_same_v || + std::is_same_v); protected: void SetUp() override @@ -124,6 +163,8 @@ class Test_Fwd_Mha : public testing::TestWithParam GTEST_SKIP() << "CPU Dropout currently supports only fully occupied warps"; } + dropout = drop; + mha_descriptor.SetParams(1); ASSERT_EQ(miopenCreateMhaProblem(&problem, &mha_descriptor, miopenProblemDirectionForward), miopenStatusSuccess); @@ -200,27 +241,65 @@ class Test_Fwd_Mha : public testing::TestWithParam mDesc_ref = tensor{n, h, s, 1}; zInvDesc_ref = tensor{n, h, s, 1}; - test::cpu::MultiHeadAttentionfp8( - std::get>(tensors[miopenTensorMhaQ]->m_cpu_tensor), - std::get>(tensors[miopenTensorMhaK]->m_cpu_tensor), - std::get>(tensors[miopenTensorMhaV]->m_cpu_tensor), - softmax_ref, - mDesc_ref, - zInvDesc_ref, - q.mDescale, - k.mDescale, - v.mDescale, - s_descale, - s_scale, - o_scale, - drop, - std::get>(tensors[miopenTensorMhaDropoutSeed]->m_cpu_tensor) - .data.front(), - std::get>(tensors[miopenTensorMhaDropoutOffset]->m_cpu_tensor) - .data.front(), - amaxS_ref, - amaxO_ref, - oDesc_ref); + RunReference(std::get>(tensors[miopenTensorMhaQ]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaK]->m_cpu_tensor), + std::get>(tensors[miopenTensorMhaV]->m_cpu_tensor), + softmax_ref, + mDesc_ref, + zInvDesc_ref, + q.mDescale, + k.mDescale, + v.mDescale, + s_descale, + s_scale, + o_scale, + dropout, + std::get>(tensors[miopenTensorMhaDropoutSeed]->m_cpu_tensor) + .data.front(), + 
std::get>(tensors[miopenTensorMhaDropoutOffset]->m_cpu_tensor) + .data.front(), + amaxS_ref, + amaxO_ref, + oDesc_ref); + } + + virtual void RunReference(const tensor& q_val, + const tensor& k_val, + const tensor& v_val, + tensor& softmax, + tensor& attn_max, + tensor& Z_sum, + float q_descale, + float k_descale, + float v_descale, + float s_descale, + float s_scale, + float o_scale, + float dropout_rate, + uint64_t seed, + uint64_t offset, + float& aMax_S, + float& aMax_O, + tensor& multi_head_attention_fp8) + { + test::cpu::MultiHeadAttentionForwardfp8(q_val, + k_val, + v_val, + softmax, + attn_max, + Z_sum, + q_descale, + k_descale, + v_descale, + s_descale, + s_scale, + o_scale, + dropout_rate, + seed, + offset, + aMax_S, + aMax_O, + multi_head_attention_fp8); } void TestBody() override @@ -249,40 +328,45 @@ class Test_Fwd_Mha : public testing::TestWithParam &handle, solution, args.size(), args.data(), workspace.ptr(), workspace.size()), miopenStatusSuccess); - auto GetResult = [this, &handle](miopenTensorArgumentId_t id, auto type) { - using ResultT = std::decay_t; - auto& tensorStructPtr = tensors[id]; - auto& cpu_tensor = std::get>(tensorStructPtr->m_cpu_tensor); + VerifyResults(handle); + } + } - cpu_tensor.data = - handle.Read(tensorStructPtr->m_gpu_buffer, cpu_tensor.data.size()); + virtual void VerifyResults(Handle& handle) + { + auto GetResult = [this, &handle](miopenTensorArgumentId_t id, auto type) { + using ResultT = std::decay_t; + auto& tensorStructPtr = tensors[id]; + auto& cpu_tensor = std::get>(tensorStructPtr->m_cpu_tensor); - return cpu_tensor; - }; + cpu_tensor.data = + handle.Read(tensorStructPtr->m_gpu_buffer, cpu_tensor.data.size()); - const double error_threshold = 5e-6; - const double fp8_error_threshold = (std::is_same_v) ? 2e-4 : error_threshold; + return cpu_tensor; + }; - const auto& resAmaxS = GetResult(miopenTensorMhaAmaxS, float{}); - auto amaxS_abs_diff = std::abs(amaxS_ref - resAmaxS[0]); - EXPECT_LT(amaxS_abs_diff, error_threshold) - << " ref: " << amaxS_ref << " result: " << resAmaxS[0]; + const double error_threshold = 5e-6; + const double fp8_error_threshold = (std::is_same_v) ? 
2e-4 : error_threshold; - const auto& resAmaxO = GetResult(miopenTensorMhaAmaxO, float{}); - auto amaxO_abs_diff = std::abs(amaxO_ref - resAmaxO[0]); - EXPECT_LT(amaxO_abs_diff, error_threshold) - << " ref: " << amaxO_ref << " result: " << resAmaxO[0]; + const auto& resAmaxS = GetResult(miopenTensorMhaAmaxS, float{}); + auto amaxS_abs_diff = std::abs(amaxS_ref - resAmaxS[0]); + EXPECT_LT(amaxS_abs_diff, error_threshold) + << " ref: " << amaxS_ref << " result: " << resAmaxS[0]; - double M_error = miopen::rms_range(mDesc_ref, GetResult(miopenTensorMhaM, float{})); - EXPECT_LT(M_error, error_threshold); + const auto& resAmaxO = GetResult(miopenTensorMhaAmaxO, float{}); + auto amaxO_abs_diff = std::abs(amaxO_ref - resAmaxO[0]); + EXPECT_LT(amaxO_abs_diff, error_threshold) + << " ref: " << amaxO_ref << " result: " << resAmaxO[0]; - double ZInv_error = - miopen::rms_range(zInvDesc_ref, GetResult(miopenTensorMhaZInv, float{})); - EXPECT_LT(ZInv_error, error_threshold); + double M_error = miopen::rms_range(mDesc_ref, GetResult(miopenTensorMhaM, float{})); + EXPECT_LT(M_error, error_threshold); - double O_error = miopen::rms_range(oDesc_ref, GetResult(miopenTensorMhaO, T{})); - EXPECT_LT(O_error, fp8_error_threshold); - } + double ZInv_error = + miopen::rms_range(zInvDesc_ref, GetResult(miopenTensorMhaZInv, float{})); + EXPECT_LT(ZInv_error, error_threshold); + + double O_error = miopen::rms_range(oDesc_ref, GetResult(miopenTensorMhaO, T{})); + EXPECT_LT(O_error, fp8_error_threshold); } void TearDown() override @@ -304,6 +388,7 @@ class Test_Fwd_Mha : public testing::TestWithParam tensor zInvDesc_ref; float amaxS_ref; float amaxO_ref; + float dropout; MhaDescriptor mha_descriptor; miopenProblem_t problem = nullptr; @@ -313,6 +398,74 @@ class GPU_Fwd_Mha_FP32 : public Test_Fwd_Mha { }; +class GPU_Fwd_Mha_FP16 : public Test_Fwd_Mha +{ + void SetUp() override + { + if(!IsTestSupportedByDevice(Gpu::gfx90A | Gpu::gfx94X)) + { + GTEST_SKIP() << "FP16 is unsupported on this HW"; + } + + Test_Fwd_Mha::SetUp(); + + if(dropout != 0.0f) + { + GTEST_SKIP() << "Dropout not currently supported for FP16"; + } + } + + void RunReference(const tensor& q_val, + const tensor& k_val, + const tensor& v_val, + tensor& softmax, + tensor& attn_max, + tensor& Z_sum, + [[maybe_unused]] float q_descale, + [[maybe_unused]] float k_descale, + [[maybe_unused]] float v_descale, + [[maybe_unused]] float s_descale, + [[maybe_unused]] float s_scale, + [[maybe_unused]] float o_scale, + [[maybe_unused]] float dropout_rate, + [[maybe_unused]] uint64_t seed, + [[maybe_unused]] uint64_t offset, + [[maybe_unused]] float& aMax_S, + [[maybe_unused]] float& aMax_O, + tensor& output) override + { + test::cpu::MultiHeadAttentionForwardfp16( + q_val, k_val, v_val, softmax, attn_max, Z_sum, output); + } + + void VerifyResults(Handle& handle) override + { + auto GetResult = [this, &handle](miopenTensorArgumentId_t id) { + auto& tensorStructPtr = tensors[id]; + auto& cpu_tensor = std::get>(tensorStructPtr->m_cpu_tensor); + + cpu_tensor.data = handle.Read(tensorStructPtr->m_gpu_buffer, + cpu_tensor.data.size()); + + return cpu_tensor; + }; + + const double errorThreshold = 4e-4; + double oError = miopen::rms_range(oDesc_ref, GetResult(miopenTensorMhaO)); + + if(dropout > 0.0f) + { + // Due to GPU version using a different dropout generator we will compare to CPU without + // dropout and verify that dropout causes a large difference when comparing results. 
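// miopen::rms_range collapses the element-wise mismatch into one scalar,
// which lets both directions of this check stay one line each. A sketch of
// the metric, assuming the usual RMS-relative-error definition:
//
//   template <class R>
//   double RmsError(const std::vector<R>& ref, const std::vector<R>& res)
//   {
//       double num = 0.0, den = 0.0;
//       for(std::size_t i = 0; i < ref.size(); ++i)
//       {
//           const double d = double(ref[i]) - double(res[i]);
//           num += d * d;
//           den += double(ref[i]) * double(ref[i]);
//       }
//       return std::sqrt(num / std::max(den, 1e-38));
//   }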
+ EXPECT_GT(oError, errorThreshold); + } + else + { + EXPECT_LT(oError, errorThreshold); + } + } +}; + class GPU_Fwd_Mha_FP8 : public Test_Fwd_Mha { void SetUp() override @@ -334,6 +487,12 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Fwd_Mha_FP32, testing::ValuesIn(GetSmokeCase INSTANTIATE_TEST_SUITE_P(Full, GPU_Fwd_Mha_FP32, testing::ValuesIn(GetFullTestCases())); GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GPU_Fwd_Mha_FP32); +TEST_P(GPU_Fwd_Mha_FP16, Test_float) { return GPU_Fwd_Mha_FP16::TestBody(); }; + +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Fwd_Mha_FP16, testing::ValuesIn(GetFp16SmokeCases())); +INSTANTIATE_TEST_SUITE_P(Full, GPU_Fwd_Mha_FP16, testing::ValuesIn(GetFp16FullTestCases())); +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(GPU_Fwd_Mha_FP16); + TEST_P(GPU_Fwd_Mha_FP8, Test_float) { return Test_Fwd_Mha::TestBody(); }; INSTANTIATE_TEST_SUITE_P(Smoke, GPU_Fwd_Mha_FP8, testing::ValuesIn(GetSmokeCases())); diff --git a/test/gtest/graphapi_capi_mha_backward.cpp b/test/gtest/graphapi_capi_mha_backward.cpp index 874ea075bb..814b09a051 100644 --- a/test/gtest/graphapi_capi_mha_backward.cpp +++ b/test/gtest/graphapi_capi_mha_backward.cpp @@ -95,7 +95,7 @@ class MhaBackwardTest : public MhaCommonTest // proper O, M and zInv tensors are required for backward pass. // randomly generated M and zInv may lead to nan\inf values - test::cpu::MultiHeadAttentionfp8( + test::cpu::MultiHeadAttentionForwardfp8( GetTensor(m_realTensorMap[miopenTensorMhaQ]->m_tensorVariant), GetTensor(m_realTensorMap[miopenTensorMhaK]->m_tensorVariant), GetTensor(m_realTensorMap[miopenTensorMhaV]->m_tensorVariant), diff --git a/test/gtest/graphapi_capi_mha_common.hpp b/test/gtest/graphapi_capi_mha_common.hpp index a76cb2e7d2..c30e0401a2 100644 --- a/test/gtest/graphapi_capi_mha_common.hpp +++ b/test/gtest/graphapi_capi_mha_common.hpp @@ -165,7 +165,11 @@ class DescriptorWrapper std::vector m_refsToKeep; }; -using TensorVariant = std::variant, tensor, tensor, tensor>; +using TensorVariant = std::variant, + tensor, + tensor, + tensor, + tensor>; template tensor& GetTensor(TensorVariant& var) @@ -211,6 +215,10 @@ miopenDataType_t GetMainType() { return miopenBFloat8; } + else if(std::is_same_v) + { + return miopenHalf; + } assert(false); return miopenFloat; @@ -660,6 +668,10 @@ class MhaCommonTest : public testing::TestWithParamm_tensorVariant = tensor{n, h, s, d}; } + else if(dtype == miopenHalf) + { + tensorDataPtr->m_tensorVariant = tensor{n, h, s, d}; + } else { assert(false); diff --git a/test/gtest/graphapi_capi_mha_forward.cpp b/test/gtest/graphapi_capi_mha_forward.cpp index aa10817061..d00ff4c180 100644 --- a/test/gtest/graphapi_capi_mha_forward.cpp +++ b/test/gtest/graphapi_capi_mha_forward.cpp @@ -230,7 +230,7 @@ class MhaForwardTest : public MhaCommonTest return it->second->m_tensorVariant; }; - test::cpu::MultiHeadAttentionfp8( + test::cpu::MultiHeadAttentionForwardfp8( GetTensor(lookup(miopenTensorMhaQ)), GetTensor(lookup(miopenTensorMhaK)), GetTensor(lookup(miopenTensorMhaV)), @@ -296,7 +296,55 @@ class GPU_MhaForward_FP8 : public MhaForwardTest } }; +class GPU_MhaForward_FP16 : public MhaForwardTest +{ + void SetUp() override + { + if(!IsTestSupportedByDevice(Gpu::gfx90A | Gpu::gfx94X)) + { + GTEST_SKIP() << "FP16 is unsupported on this HW"; + } + + MhaForwardTest::SetUp(); + + if(m_bernulliProbability != 0.0f) + { + GTEST_SKIP() << "Dropout not currently supported for FP16"; + } + } + + void RunCPUverify(miopen::Handle& handle) override + { + auto softmaxRef = tensor{m_testN, m_testH, m_testS, m_testS}; + auto 
oDescRef = tensor{m_testN, m_testH, m_testS, m_testD}; + auto mDescRef = tensor{m_testN, m_testH, m_testS, 1}; + auto zInvDescRef = tensor{m_testN, m_testH, m_testS, 1}; + + auto lookup = [this](const int64_t id) -> TensorVariant& { + auto it = m_realTensorMap.find(id); + assert(it != m_realTensorMap.cend()); + return it->second->m_tensorVariant; + }; + + test::cpu::MultiHeadAttentionForwardfp16( + GetTensor(lookup(miopenTensorMhaQ)), + GetTensor(lookup(miopenTensorMhaK)), + GetTensor(lookup(miopenTensorMhaV)), + softmaxRef, + mDescRef, + zInvDescRef, + oDescRef); + + const double errorThreshold = 4e-4; + + auto oRes = GetResult(miopenTensorMhaO, handle); + double oError = miopen::rms_range(oDescRef, oRes); + EXPECT_LT(oError, errorThreshold); + } +}; + TEST_P(GPU_MhaForward_FP32, TestFloat) { Run(); } +TEST_P(GPU_MhaForward_FP16, TestFloat16) { Run(); } TEST_P(GPU_MhaForward_FP8, TestFloat8) { Run(); } inline auto GetCases() @@ -310,4 +358,6 @@ inline auto GetCases() INSTANTIATE_TEST_SUITE_P(Smoke, GPU_MhaForward_FP32, GetCases()); +INSTANTIATE_TEST_SUITE_P(Smoke, GPU_MhaForward_FP16, GetCases()); + INSTANTIATE_TEST_SUITE_P(Smoke, GPU_MhaForward_FP8, GetCases()); diff --git a/test/gtest/graphapi_mha_cpp_common.hpp b/test/gtest/graphapi_mha_cpp_common.hpp index ec385ab360..48118fdfa5 100644 --- a/test/gtest/graphapi_mha_cpp_common.hpp +++ b/test/gtest/graphapi_mha_cpp_common.hpp @@ -383,24 +383,24 @@ class MhaGraphTestBase CpuMhaFwdOut out(n, h, s, d); - test::cpu::MultiHeadAttentionfp8(lookup_f("Q"), - lookup_f("K"), - lookup_f("V"), - out.mSoftMax, - out.mM, - out.mZinv, - lookup_f("DSCL_Q")[0], - lookup_f("DSCL_K")[0], - lookup_f("DSCL_V")[0], - lookup_f("DSCL_S")[0], - lookup_f("SCL_S")[0], - lookup_f("SCL_O")[0], - lookup_f("RND_PRB")[0], - lookup_i("RND_SD")[0], - lookup_i("RND_OFF")[0], - out.mAmaxS, - out.mAmaxO, - out.mO); + test::cpu::MultiHeadAttentionForwardfp8(lookup_f("Q"), + lookup_f("K"), + lookup_f("V"), + out.mSoftMax, + out.mM, + out.mZinv, + lookup_f("DSCL_Q")[0], + lookup_f("DSCL_K")[0], + lookup_f("DSCL_V")[0], + lookup_f("DSCL_S")[0], + lookup_f("SCL_S")[0], + lookup_f("SCL_O")[0], + lookup_f("RND_PRB")[0], + lookup_i("RND_SD")[0], + lookup_i("RND_OFF")[0], + out.mAmaxS, + out.mAmaxO, + out.mO); return out; } diff --git a/test/gtest/mha_helper.hpp b/test/gtest/mha_helper.hpp index 89114160c3..fc8dd4d0fd 100644 --- a/test/gtest/mha_helper.hpp +++ b/test/gtest/mha_helper.hpp @@ -139,8 +139,8 @@ void Dot_4D_T_4D(const tensor& A_mat, const tensor& B_mat, tensor& C }); } -template -void Dot_4D_4D(const tensor& A_mat, const tensor& B_mat, tensor& C_mat) +template +void Dot_4D_4D(const tensor& A_mat, const tensor& B_mat, tensor& C_mat) { size_t k_val = A_mat.desc.GetLengths()[3]; assert(k_val == B_mat.desc.GetLengths()[2]); @@ -151,7 +151,7 @@ void Dot_4D_4D(const tensor& A_mat, const tensor& B_mat, tensor& C_m sum += double(A_mat(b_id, h_id, sl_id, k_id)) * double(B_mat(b_id, h_id, k_id, dk_id)); } - C_mat(b_id, h_id, sl_id, dk_id) = T2(sum); + C_mat(b_id, h_id, sl_id, dk_id) = T3(sum); }); } @@ -358,26 +358,26 @@ void SoftMax(const tensor& q_dot_k_transpose, } template -void MultiHeadAttentionfp8(const tensor& q_val, - const tensor& k_val, - const tensor& v_val, - tensor& softmax, - tensor& attn_max, - tensor& Z_sum, - float q_descale, - float k_descale, - float v_descale, - float s_descale, - float s_scale, - float o_scale, - float dropout_rate, - uint64_t seed, - uint64_t offset, - float& aMax_S, - float& aMax_O, - tensor& multi_head_attention_fp8, - const tensor* 
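// Rename note: the "Forward" infix (MultiHeadAttentionfp8 ->
// MultiHeadAttentionForwardfp8) makes call sites unambiguous now that the
// backward tests also invoke this forward reference to generate the O, M and
// zInv inputs they need (see graphapi_capi_mha_backward.cpp above).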
optional_bias = - nullptr) // pointer to optional bias, nullptr if not provided +void MultiHeadAttentionForwardfp8(const tensor& q_val, + const tensor& k_val, + const tensor& v_val, + tensor& softmax, + tensor& attn_max, + tensor& Z_sum, + float q_descale, + float k_descale, + float v_descale, + float s_descale, + float s_scale, + float o_scale, + float dropout_rate, + uint64_t seed, + uint64_t offset, + float& aMax_S, + float& aMax_O, + tensor& multi_head_attention_fp8, + const tensor* optional_bias = + nullptr) // pointer to optional bias, nullptr if not provided { auto inputLengths = q_val.desc.GetLengths(); inputLengths[3] = inputLengths[2]; // NHSD converting to NHSS @@ -424,7 +424,34 @@ void MultiHeadAttentionfp8(const tensor& q_val, } template -void MultiHeadAttentionf32( +void MultiHeadAttentionForwardfp16(const tensor& q_val, + const tensor& k_val, + const tensor& v_val, + tensor& softmax, + tensor& attn_max, + tensor& Z_sum, + tensor& multi_head_attention, + const tensor* optional_bias = + nullptr) // pointer to optional bias, nullptr if not provided +{ + auto inputLengths = q_val.desc.GetLengths(); + inputLengths[3] = inputLengths[2]; // NHSD converting to NHSS + tensor q_dot_k_transpose(inputLengths); + + Dot_4D_4D_T(q_val, k_val, q_dot_k_transpose); + + if(optional_bias != nullptr) + { + PointWiseAdd(q_dot_k_transpose, *optional_bias, q_dot_k_transpose); + } + + SoftMax(q_dot_k_transpose, softmax, attn_max, Z_sum); + + Dot_4D_4D(softmax, v_val, multi_head_attention); +} + +template +void MultiHeadAttentionForwardf32( const tensor& q_val, const tensor& k_val, const tensor& v_val, From 38258d5f68a5e5c47183a60a8f078d305f62f8d9 Mon Sep 17 00:00:00 2001 From: Evgenii Averin <86725875+averinevg@users.noreply.github.com> Date: Fri, 18 Oct 2024 00:57:18 +0200 Subject: [PATCH 4/4] [tests] Unit tests for convolution solvers from #1911, part 2 (#3318) --- src/include/miopen/conv/solvers.hpp | 153 +++++++------- src/include/miopen/generic_search.hpp | 19 +- src/include/miopen/solver.hpp | 181 +++++++--------- src/solver.cpp | 10 + src/solver/conv/conv_ocl_dir2D_bwdWrW_2.cpp | 13 +- src/solver/conv/conv_ocl_dir2Dfwd1x1.cpp | 24 ++- .../conv_ocl_dir2Dfwd_exhaustive_search.cpp | 25 +++ test/gtest/regression_half_vega_gfx908.cpp | 80 ------- test/gtest/solver_bwd.hpp | 2 +- test/gtest/solver_fwd.hpp | 2 +- test/gtest/solver_wrw.hpp | 2 +- test/gtest/unit_conv_solver.cpp | 195 ++++++++++++------ test/gtest/unit_conv_solver.hpp | 54 +++-- ...nit_conv_solver_ConvDirectNaiveConvBwd.cpp | 26 ++- ...nit_conv_solver_ConvDirectNaiveConvFwd.cpp | 34 +-- ...nit_conv_solver_ConvDirectNaiveConvWrw.cpp | 24 ++- .../gtest/unit_conv_solver_ConvOclBwdWrW2.hpp | 108 ++++++++++ .../unit_conv_solver_ConvOclBwdWrW2N1.cpp | 35 +--- .../unit_conv_solver_ConvOclBwdWrW2N16.cpp | 28 +++ .../unit_conv_solver_ConvOclBwdWrW2N2.cpp | 28 +++ .../unit_conv_solver_ConvOclBwdWrW2N4.cpp | 28 +++ .../unit_conv_solver_ConvOclBwdWrW2N8.cpp | 28 +++ ...t_conv_solver_ConvOclBwdWrW2NonTunable.cpp | 97 +++++++++ .../unit_conv_solver_ConvOclDirectFwd.cpp | 131 ++++++++++++ .../unit_conv_solver_ConvOclDirectFwd1x1.cpp | 179 ++++++++++++++++ .../unit_conv_solver_ConvWinoFuryRxS.cpp | 17 +- .../unit_conv_solver_GemmBwd1x1_stride1.cpp | 17 +- .../unit_conv_solver_GemmBwd1x1_stride2.cpp | 17 +- test/gtest/unit_conv_solver_GemmBwdRest.cpp | 19 +- .../gtest/unit_conv_solver_GemmFwd1x1_0_1.cpp | 17 +- .../unit_conv_solver_GemmFwd1x1_0_1_int8.cpp | 13 +- .../gtest/unit_conv_solver_GemmFwd1x1_0_2.cpp | 17 +- 
test/gtest/unit_conv_solver_GemmFwdRest.cpp | 27 ++- .../unit_conv_solver_GemmWrw1x1_stride1.cpp | 17 +- .../unit_conv_solver_GemmWrwUniversal.cpp | 17 +- 35 files changed, 1206 insertions(+), 478 deletions(-) delete mode 100644 test/gtest/regression_half_vega_gfx908.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclBwdWrW2.hpp rename src/include/miopen/type_traits.hpp => test/gtest/unit_conv_solver_ConvOclBwdWrW2N1.cpp (62%) create mode 100644 test/gtest/unit_conv_solver_ConvOclBwdWrW2N16.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclBwdWrW2N2.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclBwdWrW2N4.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclBwdWrW2N8.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclBwdWrW2NonTunable.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclDirectFwd.cpp create mode 100644 test/gtest/unit_conv_solver_ConvOclDirectFwd1x1.cpp diff --git a/src/include/miopen/conv/solvers.hpp b/src/include/miopen/conv/solvers.hpp index 699f267b99..9713d27135 100644 --- a/src/include/miopen/conv/solvers.hpp +++ b/src/include/miopen/conv/solvers.hpp @@ -52,6 +52,12 @@ namespace debug { MIOPEN_EXPORT extern bool AlwaysEnableConvDirectNaive; // NOLINT (cppcoreguidelines-avoid-non-const-global-variables) +/// WORKAROUND_SWDEV_271887 disables ConvOclDirectFwd1x1 solver on gfx10 due to precision issues. +/// However we still want to check that the solver is not broken and therefore use +/// disable_wa_swdev_271887 = true to enable it. +// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) +MIOPEN_INTERNALS_EXPORT extern bool disable_wa_swdev_271887; + } // namespace debug struct AnyInvokeParams; @@ -62,16 +68,24 @@ const int wave_size = 64; namespace conv { -/// Base class for convolution tunable and non-tunable solvers -using ConvSolverBase = SolverMixin; +/// Common interface for convolution tunable and non-tunable solvers +using ConvSolverInterface = SolverInterface; + +/// Common interface for convolution non-tunable solvers +using ConvSolverInterfaceNonTunable = + SolverInterfaceNonTunable; + +/// Common interface for convolution tunable solvers +using ConvSolverInterfaceTunable = + SolverInterfaceTunable; /// Typedef for convolution non-tunable solvers -using ConvSolver = NonTunableSolverBase; +using ConvSolver = SolverBaseNonTunable; /// Typedef for convolution tunable solvers template using ConvTunableSolver = - TunableSolverMixin; + SolverBaseTunable; struct PerformanceConfigConvAsm3x3U : PerfConfigBase { @@ -159,7 +173,6 @@ struct PerformanceConfigConvAsm1x1U : PerfConfigBase { - MIOPEN_INTERNALS_EXPORT LegacyPerformanceConfig GetDefaultPerformanceConfig( - const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT LegacyPerformanceConfig - Search(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const AnyInvokeParams& invoke_ctx) const override; + LegacyPerformanceConfig + GetDefaultPerformanceConfig(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; + LegacyPerformanceConfig Search(const ExecutionContext&, + const miopen::conv::ProblemDescription&, + const AnyInvokeParams& invoke_ctx) const override; private: template @@ -1753,35 +1771,33 @@ struct ConvOclDirectFwdLegacyExhaustiveSearch : ConvTunableSolver(); } - MIOPEN_INTERNALS_EXPORT static ConvSolution - BaseGetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&); + static ConvSolution 
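+    // Export note: MIOPEN_INTERNALS_EXPORT now appears once on the owning
+    // struct (e.g. struct MIOPEN_INTERNALS_EXPORT ConvOclDirectFwd1x1 below)
+    // rather than on each method, so the per-method export markers are
+    // dropped throughout this hunk.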
BaseGetSolution(const ExecutionContext&, + const miopen::conv::ProblemDescription&, + const LegacyPerformanceConfig&); - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&) const override; - MIOPEN_INTERNALS_EXPORT bool - IsValidPerformanceConfig(const ExecutionContext&, + bool IsApplicable(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, const miopen::conv::ProblemDescription&, const LegacyPerformanceConfig&) const override; + bool IsValidPerformanceConfig(const ExecutionContext&, + const miopen::conv::ProblemDescription&, + const LegacyPerformanceConfig&) const override; }; -struct ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch +struct MIOPEN_INTERNALS_EXPORT ConvOclDirectFwd1x1 final : ConvOclDirectFwdLegacyExhaustiveSearch { const std::string& SolverDbId() const override { return GetSolverDbId(); } - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&, - const LegacyPerformanceConfig&) const override; + bool IsApplicable(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; + ConvSolution GetSolution(const ExecutionContext&, + const miopen::conv::ProblemDescription&, + const LegacyPerformanceConfig&) const override; bool IsValidPerformanceConfig(const ExecutionContext&, const miopen::conv::ProblemDescription&, @@ -2194,14 +2210,17 @@ struct PerformanceConfigAsmDirect3x3WrW : PerfConfigBase& other) const; + void HeuristicInit(const miopen::conv::ProblemDescription&); + bool IsValidValue() const; + bool SetNextValue(const miopen::conv::ProblemDescription&); + bool IsValid(const ExecutionContext&, const miopen::conv::ProblemDescription&) const; + bool operator==(const PerformanceConfigConvOclBwdWrw2& other) const; }; template -struct ConvOclBwdWrW2 : ConvTunableSolver> +struct ConvOclBwdWrW2 final : ConvTunableSolver> { - const std::string& SolverDbId() const override + MIOPEN_INTERNALS_EXPORT const std::string& SolverDbId() const override { return this->template GetSolverDbId>(); } @@ -2462,7 +2482,7 @@ struct ConvOclBwdWrW2 : ConvTunableSolver; /// Basically, this is *hack* for non-group 3x3 and 1x1 cases. /// It is assumed that Solutions provided by the ConvOclBwdWrW2 solver /// would never beat 3x3 and 1x1 assembly WrW kernels, even after tuning. -struct ConvOclBwdWrW2NonTunable final : ConvOclBwdWrW2<1> +struct MIOPEN_INTERNALS_EXPORT ConvOclBwdWrW2NonTunable final : ConvSolver { const std::string& SolverDbId() const override { return GetSolverDbId(); } - MIOPEN_INTERNALS_EXPORT bool - IsApplicable(const ExecutionContext&, const miopen::conv::ProblemDescription&) const override; - MIOPEN_INTERNALS_EXPORT ConvSolution GetSolution(const ExecutionContext&, - const miopen::conv::ProblemDescription&) const; - InvokerFactory GetInvokerFactory(const ExecutionContext& ctx, - const miopen::conv::ProblemDescription& problem) const - { - return *GetSolution(ctx, problem).invoker_factory; - } - -private: - // This function dervied from ConvOclBwdWrW2 is declared private - // so that this solver is not marked searchable/tunable. 
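// Structural note: ConvOclBwdWrW2NonTunable no longer inherits from
// ConvOclBwdWrW2<1>; it derives from plain ConvSolver and delegates to a
// local ConvOclBwdWrW2<1> instance (see conv_ocl_dir2D_bwdWrW_2.cpp below),
// so the private using-declarations that hid the tunable API are removed
// along with the inheritance.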
- using ConvOclBwdWrW2<1>::GetDefaultPerformanceConfig; - using ConvOclBwdWrW2<1>::GetSolution; - using ConvOclBwdWrW2<1>::GetInvokerFactory; + bool IsApplicable(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; + size_t GetWorkspaceSize(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; + bool MayNeedWorkspace() const override { return true; } + ConvSolution GetSolution(const ExecutionContext&, + const miopen::conv::ProblemDescription&) const override; }; struct ConvOclBwdWrW53 final : ConvSolver diff --git a/src/include/miopen/generic_search.hpp b/src/include/miopen/generic_search.hpp index e960c04018..a9cb6e4ae9 100644 --- a/src/include/miopen/generic_search.hpp +++ b/src/include/miopen/generic_search.hpp @@ -36,7 +36,6 @@ #include #include #include -#include #include #include @@ -55,6 +54,7 @@ namespace solver { namespace debug { // This struct is not MT-safe, meaning one should use it before starting threads, thus avoiding // constructing it inside a worker thread. +/// \todo This class should be moved out of the library struct MIOPEN_INTERNALS_EXPORT TuningIterationScopedLimiter { TuningIterationScopedLimiter(std::size_t new_limit); @@ -241,7 +241,6 @@ class HeartBeat /// - Its return type shall be suitable for instantiation of the ComputedContainer. /// * GetSolution shall be implemented. /// * Solution should provide invoker -/// * RunAndMeasureSolution must NOT be implemented. Invoker will be used instead. /// /// clang-format-off /// ----------------------------------------------- @@ -263,17 +262,6 @@ class HeartBeat /// ------------------------------------------------ /// clang-format-on -template -using RunAndMeasure_t = - decltype(std::declval().RunAndMeasureSolution(std::declval(), - std::declval(), - std::declval(), - std::declval(), - std::declval(), - std::declval(), - std::declval(), - std::declval())); - template auto GetAllConfigs(const Solver s, const Context& context, const Problem& problem) -> ComputedContainer decltype(s.GetDefaultPerformanceConfig(context_, problem)) { - static_assert( - !(HasMember{} || - HasMember{}), - "RunAndMeasure is obsolete. Solvers should implement auto-tune evaluation in invoker"); - auto context = context_; context.is_for_generic_search = true; diff --git a/src/include/miopen/solver.hpp b/src/include/miopen/solver.hpp index 69f47f6ed6..32a29ead6e 100644 --- a/src/include/miopen/solver.hpp +++ b/src/include/miopen/solver.hpp @@ -35,14 +35,21 @@ #include #include -#include - #include #include #include namespace miopen { +namespace debug { + +/// Enables deprecated solvers. +/// This variable is intended for use in unit tests. +// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) +MIOPEN_INTERNALS_EXPORT extern bool enable_deprecated_solvers; + +} // namespace debug + struct AnyInvokeParams; namespace solver { @@ -72,35 +79,12 @@ struct SolverBase return null_id; } - /// Returns true if solution can work on given SW/HW platform (runtime/device) - /// and provides correct result for the problem config. - /// - /// Every SolverBase which IsApplicable() for some problem config must be able to - /// GetDefaultPerformanceConfig() so that GetSolution() would return valid - /// solution for a problem (i.e. convolution). In other words, if a Solution - /// says "I'm suitable" for a problem, it agrees to solve that problem correctly. 
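// The applicability contract described above is not lost: it moves to the
// strongly typed SolverInterface::IsApplicable below; only the type-erased
// boost::any overloads are retired.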
- virtual bool IsApplicable(const ExecutionContext& ctx, const boost::any& problem) const = 0; - /// [Informative as of Sep 2020] The minimum requirement for Dynamic Solvers: /// Batch size and input picture size (N, W, H) must NOT be compiled into the /// kernel(s) that consist a Solution. These must go into the kernel as a /// run-time parameters. virtual bool IsDynamic() const { return false; } - static constexpr float wti_approximate_worst = -2; - - /// [Informative as of Sep 2020] Returns an approximated value of the expected - /// WTI or wti_approximate_worst when this value can't be computed. Tips: - /// * Value 1.0 corresponds to the 100% utilization of HW capabilities as - /// if Direct computational algorithm is used. - /// * [Notice] WTI may exceed 1.0 for highly optimized algorithms like Winograd. - /// * @see https://github.com/ROCm/MIOpen/issues/410 - virtual float GetWti(const ExecutionContext& ctx, const boost::any& problem) const = 0; - - /// Returns the workspace size required by the solver for a given ExecutionContext - virtual size_t GetWorkspaceSize(const ExecutionContext& ctx, - const boost::any& problem) const = 0; - /// Must return true if a Solver has its own implementation of GetWorkspaceSize(). virtual bool MayNeedWorkspace() const { return false; } @@ -126,45 +110,59 @@ struct SolverBase } }; +/// Common interface for tunable and non-tunable solvers template -struct SolverMixin : SolverBase +struct SolverInterface : SolverBase { static_assert(std::is_base_of{}, "Context must be derived of ExecutionContext"); - virtual bool IsApplicable(const Context&, const Problem&) const = 0; - virtual float GetWti(const Context&, const Problem&) const { return wti_approximate_worst; }; - virtual size_t GetWorkspaceSize(const Context&, const Problem&) const { return 0; }; + /// Returns true if a Solver can work on given SW/HW platform (runtime/device) + /// and provides correct result for the problem config. + virtual bool IsApplicable(const Context& ctx, const Problem& problem) const = 0; - bool IsApplicable(const ExecutionContext& ctx, const boost::any& problem) const final - { - return IsApplicable(dynamic_cast(ctx), - boost::any_cast(problem)); - } + static constexpr float wti_approximate_worst = -2; - float GetWti(const ExecutionContext& ctx, const boost::any& problem) const final - { - return GetWti(dynamic_cast(ctx), boost::any_cast(problem)); - } + /// [Informative as of Sep 2020] Returns an approximated value of the expected + /// WTI or wti_approximate_worst when this value can't be computed. Tips: + /// * Value 1.0 corresponds to the 100% utilization of HW capabilities as + /// if Direct computational algorithm is used. + /// * [Notice] WTI may exceed 1.0 for highly optimized algorithms like Winograd. 
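+    /// * Example (informal): WTI = 0.5 suggests roughly half the speed of
+    ///   the idealized Direct algorithm, WTI = 2.0 roughly twice that speed.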
+ /// * @see https://github.com/ROCm/MIOpen/issues/410 + virtual float GetWti(const Context&, const Problem&) const { return wti_approximate_worst; }; - size_t GetWorkspaceSize(const ExecutionContext& ctx, const boost::any& problem) const final - { - return GetWorkspaceSize(dynamic_cast(ctx), - boost::any_cast(problem)); - } + /// Returns the workspace size required by the solver for the given Problem + virtual size_t GetWorkspaceSize(const Context&, const Problem&) const { return 0; }; }; -/// Base class for non tunable solvers +/// Common interface for non-tunable solvers template -struct NonTunableSolverBase : SolverMixin +struct SolverInterfaceNonTunable : SolverInterface { /// Takes problem config, optimization parameters and other info /// and computes information required to build and run the kernel(s). - virtual ConvSolution GetSolution(const Context&, const Problem&) const = 0; + virtual ConvSolution GetSolution(const Context& ctx, const Problem& problem) const = 0; +}; + +/// Common interface for tunable solvers +template +struct SolverInterfaceTunable : SolverInterface +{ + /// This function is a simplified version of FindSolution(), it does not obey search parameters + /// from the Context and does not use the database. Intended to be used in unit tests. + virtual ConvSolution FindSolutionSimple(const Context& ctx, + const Problem& problem, + const AnyInvokeParams& invoke_ctx) const = 0; +}; - virtual InvokerFactory GetInvokerFactory(const Context& ctx, const Problem& problem) const +/// Base class for non-tunable solvers +template +struct SolverBaseNonTunable : SolverInterfaceNonTunable +{ + InvokerFactory GetInvokerFactory(const Context& ctx, const Problem& problem) const { - return *GetSolution(ctx, problem).invoker_factory; + const auto solution = this->GetSolution(ctx, problem); + return *solution.invoker_factory; } }; @@ -173,90 +171,55 @@ struct TunableSolverTrait }; /// Base class for tunable solvers -template -struct TunableSolverBase : SolverMixin, TunableSolverTrait +template +struct SolverBaseTunable : SolverInterfaceTunable, TunableSolverTrait { /// Initializes performance config to the default values. /// The function may involve some heuristic to guess the best solution /// configuration. It is assumed that the function takes constant time /// to finish and does not run kernels to measure performance etc. /// The function shall always return valid config. - /// - /// The int parameter is needed only to not change the name of the - /// function in the derived class. Function declarations that differ - /// only by its return type cannot be overloaded. - virtual boost::any - GetDefaultPerformanceConfig(const Context& ctx, const Problem& problem, int) const = 0; + virtual PerformanceConfig GetDefaultPerformanceConfig(const Context& ctx, + const Problem& problem) const = 0; /// Should return false if performance config is wrong for a problem. /// Main use is validation of values read from the perf db. virtual bool IsValidPerformanceConfig(const Context& ctx, const Problem& problem, - const PerfConfig& config) const = 0; + const PerformanceConfig& config) const = 0; /// Search - /// - /// The int parameter is needed only to not change the name of the - /// function in the derived class. Function declarations that differ - /// only by its return type cannot be overloaded. 
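// With the templated PerformanceConfig return type below, the dummy int
// parameter described above is no longer needed: each virtual is declared
// once with its real config type, so the boost::any-plus-int overload trick
// disappears.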
- virtual boost::any Search(const Context& ctx, - const Problem& problem, - const AnyInvokeParams& invoke_ctx, - int) const = 0; + virtual PerformanceConfig + Search(const Context& ctx, const Problem& problem, const AnyInvokeParams& invoke_ctx) const = 0; /// Tunable solvers provide a GetSolution that takes a Context and PerformanceConfig - virtual ConvSolution - GetSolution(const Context& ctx, const Problem& problem, const PerfConfig& config) const = 0; + virtual ConvSolution GetSolution(const Context& ctx, + const Problem& problem, + const PerformanceConfig& config) const = 0; - virtual InvokerFactory - GetInvokerFactory(const Context& ctx, const Problem& problem, const PerfConfig& config) const + ConvSolution FindSolutionSimple(const Context& ctx, + const Problem& problem, + const AnyInvokeParams& invoke_ctx) const final { - return *GetSolution(ctx, problem, config).invoker_factory; + const PerformanceConfig config = Search(ctx, problem, invoke_ctx); + return GetSolution(ctx, problem, config); } -}; -template -struct TunableSolverMixin : TunableSolverBase -{ - static_assert(std::is_base_of{}, - "PerformanceConfig must be derived of PerfConfig"); - - virtual PerformanceConfig GetDefaultPerformanceConfig(const Context&, const Problem&) const = 0; - virtual bool - IsValidPerformanceConfig(const Context&, const Problem&, const PerformanceConfig&) const = 0; - virtual PerformanceConfig - Search(const Context&, const Problem&, const AnyInvokeParams&) const = 0; - virtual ConvSolution - GetSolution(const Context&, const Problem&, const PerformanceConfig&) const = 0; - - boost::any - GetDefaultPerformanceConfig(const Context& ctx, const Problem& problem, int) const final - { - return GetDefaultPerformanceConfig(ctx, problem); - } - - bool IsValidPerformanceConfig(const Context& ctx, - const Problem& problem, - const PerfConfig& config) const final + InvokerFactory GetInvokerFactory(const Context& ctx, + const Problem& problem, + const PerformanceConfig& config) const { - return IsValidPerformanceConfig( - ctx, problem, dynamic_cast(config)); + return *GetSolution(ctx, problem, config).invoker_factory; } +}; - boost::any Search(const Context& ctx, - const Problem& problem, - const AnyInvokeParams& invoke_ctx, - int) const final - { - return Search(ctx, problem, invoke_ctx); - } +// \todo Should be removed +template +using NonTunableSolverBase = SolverBaseNonTunable; - ConvSolution - GetSolution(const Context& ctx, const Problem& problem, const PerfConfig& config) const final - { - return GetSolution(ctx, problem, dynamic_cast(config)); - } -}; +// \todo Should be removed +template +using TunableSolverMixin = SolverBaseTunable; template struct IsTunable : std::is_base_of diff --git a/src/solver.cpp b/src/solver.cpp index 912479b9f5..1f6873d5f7 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -59,6 +59,14 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS) namespace miopen { + +namespace debug { + +// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) +bool enable_deprecated_solvers = false; + +} // namespace debug + namespace solver { std::ostream& operator<<(std::ostream& os, const KernelInfo& k) @@ -708,6 +716,8 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) bool ThisSolverIsDeprecatedStatic::IsDisabled(const ExecutionContext& ctx) { static const bool device_is_allowed = [&]() { + if(miopen::debug::enable_deprecated_solvers) + return true; if(env::enabled(MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS)) return true; const auto device = 
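// Ordering note: the programmatic debug::enable_deprecated_solvers check is
// evaluated before the MIOPEN_DEBUG_ENABLE_DEPRECATED_SOLVERS environment
// variable, so unit tests can re-enable deprecated solvers in-process. The
// result is latched in this function-local static, so the flag must be set
// before the first query.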
ctx.GetStream().GetTargetProperties().Name(); diff --git a/src/solver/conv/conv_ocl_dir2D_bwdWrW_2.cpp b/src/solver/conv/conv_ocl_dir2D_bwdWrW_2.cpp index 8c898a8a44..744bc5bf09 100644 --- a/src/solver/conv/conv_ocl_dir2D_bwdWrW_2.cpp +++ b/src/solver/conv/conv_ocl_dir2D_bwdWrW_2.cpp @@ -148,7 +148,15 @@ bool ConvOclBwdWrW2NonTunable::IsApplicable(const ExecutionContext& ctx, // At present, auto-tuning is disabled for non-group 3x3 and 1x1 filters for multiple // reasons: after tuning ocl kernel for 3x3 and 1x1 filters, assembly kernel still // dominates. Thus, this solver is used for non-group 3x3 and 1x1 filters only. - return ConvOclBwdWrW2<1>::IsApplicableBase(ctx, problem) && !IsTunable(problem); + const auto tunable = ConvOclBwdWrW2<1>{}; + return tunable.IsApplicableBase(ctx, problem) && !IsTunable(problem); +} + +size_t ConvOclBwdWrW2NonTunable::GetWorkspaceSize(const ExecutionContext& ctx, + const ProblemDescription& problem) const +{ + const auto tunable = ConvOclBwdWrW2<1>{}; + return tunable.GetWorkspaceSize(ctx, problem); } ConvSolution ConvOclBwdWrW2NonTunable::GetSolution(const ExecutionContext& ctx, @@ -156,7 +164,8 @@ ConvSolution ConvOclBwdWrW2NonTunable::GetSolution(const ExecutionContext& ctx, { // Invoking base class GetSolution with default values for params obtained // from GetDefaultPerformanceConfig() - return ConvOclBwdWrW2<1>::GetSolution(ctx, problem, GetDefaultPerformanceConfig(ctx, problem)); + const auto tunable = ConvOclBwdWrW2<1>{}; + return tunable.GetSolution(ctx, problem, tunable.GetDefaultPerformanceConfig(ctx, problem)); } template diff --git a/src/solver/conv/conv_ocl_dir2Dfwd1x1.cpp b/src/solver/conv/conv_ocl_dir2Dfwd1x1.cpp index d443f47ced..5b151d4908 100644 --- a/src/solver/conv/conv_ocl_dir2Dfwd1x1.cpp +++ b/src/solver/conv/conv_ocl_dir2Dfwd1x1.cpp @@ -36,6 +36,19 @@ MIOPEN_DECLARE_ENV_VAR_BOOL(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1) namespace miopen { + +#if WORKAROUND_SWDEV_271887 +namespace debug { + +/// WORKAROUND_SWDEV_271887 disables ConvOclDirectFwd1x1 solver on gfx10 due to precision issues. +/// However we still want to check that the solver is not broken and therefore use +/// disable_wa_swdev_271887 = true to enable it. 
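+/// A minimal save/set/restore sketch for test code (hypothetical usage,
+/// mirroring the scoped enablers used by the unit tests):
+///   const bool prev = miopen::debug::disable_wa_swdev_271887;
+///   miopen::debug::disable_wa_swdev_271887 = true;  // bypass the workaround
+///   /* run IsApplicable()/GetSolution() against the solver */
+///   miopen::debug::disable_wa_swdev_271887 = prev;  // restore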
+// NOLINTNEXTLINE (cppcoreguidelines-avoid-non-const-global-variables) +bool disable_wa_swdev_271887 = false; + +} // namespace debug +#endif + namespace solver { namespace conv { @@ -45,11 +58,14 @@ bool ConvOclDirectFwd1x1::IsApplicable(const ExecutionContext& ctx, const ProblemDescription& problem) const { #if WORKAROUND_SWDEV_271887 - if(StartsWith(ctx.GetStream().GetDeviceName(), "gfx10") || - StartsWith(ctx.GetStream().GetDeviceName(), "gfx11")) + if(!miopen::debug::disable_wa_swdev_271887) { - if(!env::enabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1)) - return false; + if(StartsWith(ctx.GetStream().GetDeviceName(), "gfx10") || + StartsWith(ctx.GetStream().GetDeviceName(), "gfx11")) + { + if(!env::enabled(MIOPEN_DEBUG_CONV_DIRECT_OCL_FWD1X1)) + return false; + } } #endif if(ThisSolverIsDeprecatedStatic::IsDisabled(ctx)) diff --git a/src/solver/conv/conv_ocl_dir2Dfwd_exhaustive_search.cpp b/src/solver/conv/conv_ocl_dir2Dfwd_exhaustive_search.cpp index 64b59353b2..213c989f37 100644 --- a/src/solver/conv/conv_ocl_dir2Dfwd_exhaustive_search.cpp +++ b/src/solver/conv/conv_ocl_dir2Dfwd_exhaustive_search.cpp @@ -45,6 +45,9 @@ namespace miopen { namespace solver { + +std::size_t GetTuningIterationsMax(); + namespace conv { using ProblemDescription = miopen::conv::ProblemDescription; @@ -462,9 +465,17 @@ ConvOclDirectFwdLegacyExhaustiveSearch::SearchImpl(const ExecutionContext& ctx, << "curr time: " << processing_time << ' ' << result); } run_counter++; + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } } else @@ -609,12 +620,26 @@ ConvOclDirectFwdLegacyExhaustiveSearch::SearchImpl(const ExecutionContext& ctx, << result); } run_counter++; + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } + if(run_counter >= GetTuningIterationsMax()) + break; } } // Compare search results vs. default performance config. diff --git a/test/gtest/regression_half_vega_gfx908.cpp b/test/gtest/regression_half_vega_gfx908.cpp deleted file mode 100644 index bef12f47bb..0000000000 --- a/test/gtest/regression_half_vega_gfx908.cpp +++ /dev/null @@ -1,80 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2023 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ -#include -#include - -#include "gtest_common.hpp" - -#include "../conv2d.hpp" - -namespace { - -auto GetTestCases() -{ - const auto env = std::tuple{std::pair{MIOPEN_FIND_MODE, "normal"}, - std::pair{MIOPEN_DEBUG_FIND_ONLY_SOLVER, "ConvOclDirectFwd1x1"}}; - - const std::string v = - " --verbose --disable-backward-data --disable-backward-weights " - "--disable-verification-cache --cmode conv --pmode default --group-count 1"; - - return std::vector{ - // clang-format off - std::pair{env, v + " --input 1 16 7 7 --weights 16 16 1 1 --pads_strides_dilations 0 0 1 1 1 1"} - // clang-format on - }; -} - -using TestCase = decltype(GetTestCases())::value_type; - -bool IsTestSupportedForDevice() -{ - // Issue #894. - // Can't be enabled for GFX10 due to WORKAROUND_SWDEV_271887 - using e_mask = enabled; - using d_mask = disabled; - return ::IsTestSupportedForDevMask(); -} - -} // namespace - -class GPU_Conv2dDefault_FP16 : public HalfTestCase> -{ -}; - -TEST_P(GPU_Conv2dDefault_FP16, HalfTest_regression_half_vega_gfx908) -{ - if(IsTestSupportedForDevice()) - { - invoke_with_params(default_check); - } - else - { - GTEST_SKIP(); - } -}; - -INSTANTIATE_TEST_SUITE_P(Full, GPU_Conv2dDefault_FP16, testing::Values(GetTestCases())); diff --git a/test/gtest/solver_bwd.hpp b/test/gtest/solver_bwd.hpp index 4458efb3ce..e9affc9f6a 100644 --- a/test/gtest/solver_bwd.hpp +++ b/test/gtest/solver_bwd.hpp @@ -40,7 +40,7 @@ template struct ConvBwdSolverTest : public ::testing::TestWithParam> { - void SolverBwd(const miopen::solver::conv::ConvSolverBase& solv) + void SolverBwd(const miopen::solver::conv::ConvSolverInterface& solv) { auto&& handle = get_handle(); diff --git a/test/gtest/solver_fwd.hpp b/test/gtest/solver_fwd.hpp index c6a074a16c..3a4ffc2143 100644 --- a/test/gtest/solver_fwd.hpp +++ b/test/gtest/solver_fwd.hpp @@ -39,7 +39,7 @@ struct ConvFwdSolverTest : public ::testing::TestWithParam>, ConvFwdSolverTestBase { - void SolverFwd(const miopen::solver::conv::ConvSolverBase& solv) + void SolverFwd(const miopen::solver::conv::ConvSolverInterface& solv) { auto&& handle = get_handle(); diff --git a/test/gtest/solver_wrw.hpp b/test/gtest/solver_wrw.hpp index 224ce56c43..1f2285bc05 100644 --- a/test/gtest/solver_wrw.hpp +++ b/test/gtest/solver_wrw.hpp @@ -40,7 +40,7 @@ template struct ConvWrwSolverTest : public ::testing::TestWithParam> { - void SolverWrw(const miopen::solver::conv::ConvSolverBase& solv) + void SolverWrw(const miopen::solver::conv::ConvSolverInterface& solv) { auto&& handle = get_handle(); diff --git a/test/gtest/unit_conv_solver.cpp b/test/gtest/unit_conv_solver.cpp index bdc434ac97..ffc1f99025 100644 --- a/test/gtest/unit_conv_solver.cpp +++ b/test/gtest/unit_conv_solver.cpp @@ -26,6 +26,7 @@ #include #include +#include #include "unit_conv_solver.hpp" @@ -41,6 +42,30 @@ namespace unit_tests { namespace { +class DeprecatedSolversScopedEnabler +{ +public: + DeprecatedSolversScopedEnabler() noexcept {} + DeprecatedSolversScopedEnabler(const DeprecatedSolversScopedEnabler&) = delete; + DeprecatedSolversScopedEnabler(DeprecatedSolversScopedEnabler&&) = delete; + DeprecatedSolversScopedEnabler& operator=(const DeprecatedSolversScopedEnabler&) 
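+    // RAII helper: copy and move are deleted so the enabler cannot escape
+    // its scope, and the destructor restores the previous flag value only if
+    // Enable() was actually called (tracked via the std::optional below).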
= delete; + DeprecatedSolversScopedEnabler& operator=(DeprecatedSolversScopedEnabler&&) = delete; + ~DeprecatedSolversScopedEnabler() noexcept + { + if(prev) + miopen::debug::enable_deprecated_solvers = prev.value(); + } + + void Enable() noexcept + { + prev = miopen::debug::enable_deprecated_solvers; + miopen::debug::enable_deprecated_solvers = true; + } + +private: + std::optional prev; +}; + bool IsDeviceSupported(Gpu supported_devs, Gpu dev) { if((supported_devs & dev) != Gpu::None) @@ -161,8 +186,50 @@ std::ostream& operator<<(std::ostream& os, const ConvTestCase& tc) // Unit test for convolution solver //************************************************************************************ +UnitTestConvSolverParams::UnitTestConvSolverParams() : UnitTestConvSolverParams(Gpu::None) {} + +UnitTestConvSolverParams::UnitTestConvSolverParams(Gpu supported_devs_) + : supported_devs(supported_devs_), + use_cpu_ref(false), + enable_deprecated_solvers(false), + tunable(false) +{ +} + +void UnitTestConvSolverParams::UseCpuRef() { use_cpu_ref = true; } + +void UnitTestConvSolverParams::EnableDeprecatedSolvers() { enable_deprecated_solvers = true; } + +void UnitTestConvSolverParams::Tunable(std::size_t iterations_max_) +{ + tunable = true; + tuning_iterations_max = iterations_max_; +} + namespace { +miopen::solver::ConvSolution FindSolution(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, + const miopen::ExecutionContext& ctx, + const miopen::conv::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) +{ + if(params.tunable) + { + using IterationLimiter = miopen::solver::debug::TuningIterationScopedLimiter; + IterationLimiter tuning_limit{params.tuning_iterations_max}; + const auto& tunable_solv = + dynamic_cast(solv); + return tunable_solv.FindSolutionSimple(ctx, problem, invoke_ctx); + } + else + { + const auto& non_tunable_solv = + dynamic_cast(solv); + return non_tunable_solv.GetSolution(ctx, problem); + } +} + template double GetThreshold(miopenConvAlgorithm_t algo, miopen::conv::Direction direction) { @@ -176,6 +243,15 @@ double GetThreshold(miopenConvAlgorithm_t algo, miopen::conv::Direction directio } } + if constexpr(std::is_same_v) + { + if(algo == miopenConvolutionAlgoDirect && + direction == miopen::conv::Direction::BackwardWeights) + { + tolerance *= 2.0; + } + } + double threshold = std::numeric_limits::epsilon() * tolerance; return threshold; } @@ -221,10 +297,10 @@ void VerifyData(const std::vector& data, // Fwd //********************************** template -void RunSolverFwd(const miopen::solver::conv::ConvSolverBase& solv, +void RunSolverFwd(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { //********************************** // Prepare @@ -287,10 +363,7 @@ void RunSolverFwd(const miopen::solver::conv::ConvSolverBase& solv, const auto invoke_params = miopen::conv::DataInvokeParams{ tensors, wspace.ptr(), wspace.size(), conv_desc.attribute.gfx90aFp16alt.GetFwd()}; - // \todo add path for tunable solvers - const auto& conv_solv = dynamic_cast(solv); - - const auto sol = conv_solv.GetSolution(ctx, problem); + const auto sol = FindSolution(solv, params, ctx, problem, invoke_params); ASSERT_TRUE(sol.Succeeded()); ASSERT_TRUE(sol.invoker_factory); const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); @@ -302,7 +375,7 @@ void 
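// Dispatch note: the FindSolution() helper above replaces the old direct
// GetSolution() call in the runners below. For params.tunable solvers it
// dynamic_casts to ConvSolverInterfaceTunable and runs FindSolutionSimple()
// under a TuningIterationScopedLimiter, capping tuning at
// params.tuning_iterations_max iterations; non-tunable solvers keep the
// plain GetSolution() path.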
RunSolverFwd(const miopen::solver::conv::ConvSolverBase& solv, //********************************** auto ref_out = tensor{output.desc}; - if(use_cpu_ref) + if(params.use_cpu_ref) { cpu_convolution_forward(conv_desc.GetSpatialDimension(), input, @@ -324,22 +397,22 @@ void RunSolverFwd(const miopen::solver::conv::ConvSolverBase& solv, } template -void RunSolverFwd(const miopen::solver::conv::ConvSolverBase& solv, +void RunSolverFwd(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { - RunSolverFwd(solv, conv_config, algo, use_cpu_ref); + RunSolverFwd(solv, params, conv_config, algo); } //********************************** // Bwd //********************************** template -void RunSolverBwd(const miopen::solver::conv::ConvSolverBase& solv, +void RunSolverBwd(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { //********************************** // Prepare @@ -402,10 +475,7 @@ void RunSolverBwd(const miopen::solver::conv::ConvSolverBase& solv, const auto invoke_params = miopen::conv::DataInvokeParams{ tensors, wspace.ptr(), wspace.size(), conv_desc.attribute.gfx90aFp16alt.GetBwd()}; - // \todo add path for tunable solvers - const auto& conv_solv = dynamic_cast(solv); - - const auto sol = conv_solv.GetSolution(ctx, problem); + const auto sol = FindSolution(solv, params, ctx, problem, invoke_params); ASSERT_TRUE(sol.Succeeded()); ASSERT_TRUE(sol.invoker_factory); const auto invoker = handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); @@ -417,7 +487,7 @@ void RunSolverBwd(const miopen::solver::conv::ConvSolverBase& solv, //********************************** auto ref_in = tensor{input.desc}; - if(use_cpu_ref) + if(params.use_cpu_ref) { cpu_convolution_backward_data(conv_desc.GetSpatialDimension(), ref_in, @@ -439,22 +509,22 @@ void RunSolverBwd(const miopen::solver::conv::ConvSolverBase& solv, } template -void RunSolverBwd(const miopen::solver::conv::ConvSolverBase& solv, +void RunSolverBwd(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { - RunSolverBwd(solv, conv_config, algo, use_cpu_ref); + RunSolverBwd(solv, params, conv_config, algo); } //********************************** // Wrw //********************************** template -void RunSolverWrw(const miopen::solver::conv::ConvSolverBase& solv, +void RunSolverWrw(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { //********************************** // Prepare @@ -517,10 +587,7 @@ void RunSolverWrw(const miopen::solver::conv::ConvSolverBase& solv, const auto invoke_params = miopen::conv::WrWInvokeParams{ tensors, wspace.ptr(), wspace.size(), conv_desc.attribute.gfx90aFp16alt.GetWrW()}; - // \todo add path for tunable solvers - const auto& conv_solv = dynamic_cast(solv); - - const auto sol = conv_solv.GetSolution(ctx, problem); + const auto sol = FindSolution(solv, params, ctx, problem, invoke_params); ASSERT_TRUE(sol.Succeeded()); ASSERT_TRUE(sol.invoker_factory); const auto invoker = 
handle.PrepareInvoker(*sol.invoker_factory, sol.construction_params); @@ -532,7 +599,7 @@ void RunSolverWrw(const miopen::solver::conv::ConvSolverBase& solv, //********************************** auto ref_weights = tensor{weights.desc}; - if(use_cpu_ref) + if(params.use_cpu_ref) { cpu_convolution_backward_weight(conv_desc.GetSpatialDimension(), input, @@ -554,32 +621,32 @@ void RunSolverWrw(const miopen::solver::conv::ConvSolverBase& solv, } template -void RunSolverWrw(const miopen::solver::conv::ConvSolverBase& solv, +void RunSolverWrw(const miopen::solver::conv::ConvSolverInterface& solv, + const UnitTestConvSolverParams& params, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { - RunSolverWrw(solv, conv_config, algo, use_cpu_ref); + RunSolverWrw(solv, params, conv_config, algo); } template -void RunSolver(const miopen::solver::conv::ConvSolverBase& solver, +void RunSolver(const miopen::solver::conv::ConvSolverInterface& solver, + const UnitTestConvSolverParams& params, miopen::conv::Direction direction, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { // clang-format off switch(direction) { case miopen::conv::Direction::Forward: - RunSolverFwd(solver, conv_config, algo, use_cpu_ref); + RunSolverFwd(solver, params, conv_config, algo); return; case miopen::conv::Direction::BackwardData: - RunSolverBwd(solver, conv_config, algo, use_cpu_ref); + RunSolverBwd(solver, params, conv_config, algo); return; case miopen::conv::Direction::BackwardWeights: - RunSolverWrw(solver, conv_config, algo, use_cpu_ref); + RunSolverWrw(solver, params, conv_config, algo); return; default: throw std::runtime_error("unknown direction"); @@ -587,11 +654,11 @@ void RunSolver(const miopen::solver::conv::ConvSolverBase& solver, // clang-format on } -void RunSolver(const miopen::solver::conv::ConvSolverBase& solver, +void RunSolver(const miopen::solver::conv::ConvSolverInterface& solver, + const UnitTestConvSolverParams& params, miopen::conv::Direction direction, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { if(conv_config.GetXDataType() == conv_config.GetWDataType() && conv_config.GetWDataType() == conv_config.GetYDataType()) @@ -600,13 +667,13 @@ void RunSolver(const miopen::solver::conv::ConvSolverBase& solver, switch(conv_config.GetXDataType()) { case miopenHalf: - RunSolver(solver, direction, conv_config, algo, use_cpu_ref); + RunSolver(solver, params, direction, conv_config, algo); return; case miopenFloat: - RunSolver(solver, direction, conv_config, algo, use_cpu_ref); + RunSolver(solver, params, direction, conv_config, algo); return; case miopenBFloat16: - RunSolver(solver, direction, conv_config, algo, use_cpu_ref); + RunSolver(solver, params, direction, conv_config, algo); return; default: throw std::runtime_error("handling of this data type is not yet implemented"); @@ -617,7 +684,7 @@ void RunSolver(const miopen::solver::conv::ConvSolverBase& solver, conv_config.GetXDataType() == miopenInt8 && conv_config.GetWDataType() == miopenInt8 && conv_config.GetYDataType() == miopenInt32) { - RunSolverFwd(solver, conv_config, algo, use_cpu_ref); + RunSolverFwd(solver, params, conv_config, algo); return; } @@ -626,21 +693,27 @@ void RunSolver(const miopen::solver::conv::ConvSolverBase& solver, } // namespace -void UnitTestConvSolverBase::SetUpImpl(Gpu supported_devs) +void UnitTestConvSolverBase::SetUpImpl(const 
UnitTestConvSolverParams& params) { - if(!IsTestSupportedByDevice(supported_devs)) + if(!IsTestSupportedByDevice(params.supported_devs)) { GTEST_SKIP(); } } -void UnitTestConvSolverBase::RunTestImpl(const miopen::solver::conv::ConvSolverBase& solver, +void UnitTestConvSolverBase::RunTestImpl(const miopen::solver::conv::ConvSolverInterface& solver, + const UnitTestConvSolverParams& params, miopen::conv::Direction direction, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref) + miopenConvAlgorithm_t algo) { - RunSolver(solver, direction, conv_config, algo, use_cpu_ref); + DeprecatedSolversScopedEnabler deprecated_solv_enabler; + if(params.enable_deprecated_solvers) + { + deprecated_solv_enabler.Enable(); + } + + RunSolver(solver, params, direction, conv_config, algo); } //************************************************************************************ @@ -648,17 +721,23 @@ void UnitTestConvSolverBase::RunTestImpl(const miopen::solver::conv::ConvSolverB //************************************************************************************ void UnitTestConvSolverDevApplicabilityBase::RunTestImpl( - const miopen::solver::conv::ConvSolverBase& solver, - Gpu supported_devs, + const miopen::solver::conv::ConvSolverInterface& solver, + const UnitTestConvSolverParams& params, miopen::conv::Direction direction, const ConvTestCase& conv_config) { + DeprecatedSolversScopedEnabler deprecated_solv_enabler; + if(params.enable_deprecated_solvers) + { + deprecated_solv_enabler.Enable(); + } + const auto problem = GetProblemDescription(direction, conv_config); const auto all_known_devs = GetAllKnownDevices(); for(const auto& [dev, dev_descr] : all_known_devs) { - const auto supported = IsDeviceSupported(supported_devs, dev); + const auto supported = IsDeviceSupported(params.supported_devs, dev); // std::cout << "Test " << dev_descr << " (supported: " << supported << ")" << std::endl; auto handle = MockHandle{dev_descr}; diff --git a/test/gtest/unit_conv_solver.hpp b/test/gtest/unit_conv_solver.hpp index b93423482e..8ab181906e 100644 --- a/test/gtest/unit_conv_solver.hpp +++ b/test/gtest/unit_conv_solver.hpp @@ -85,39 +85,57 @@ struct ConvTestCase // Unit test for convolution solver //************************************************************************************ +struct UnitTestConvSolverParams +{ + UnitTestConvSolverParams(); + UnitTestConvSolverParams(Gpu supported_devs); + + void UseCpuRef(); + void EnableDeprecatedSolvers(); + void Tunable(std::size_t iterations_max); + + Gpu supported_devs; + bool use_cpu_ref; + bool enable_deprecated_solvers; + bool tunable; + std::size_t tuning_iterations_max; +}; + class UnitTestConvSolverBase { public: - void RunTestImpl(const miopen::solver::conv::ConvSolverBase& solver, + void RunTestImpl(const miopen::solver::conv::ConvSolverInterface& solver, + const UnitTestConvSolverParams& params, miopen::conv::Direction direction, const ConvTestCase& conv_config, - miopenConvAlgorithm_t algo, - bool use_cpu_ref); + miopenConvAlgorithm_t algo); protected: - void SetUpImpl(Gpu supported_devs); + void SetUpImpl(const UnitTestConvSolverParams& params); }; template class UnitTestConvSolver : public UnitTestConvSolverBase, - public ::testing::TestWithParam> + public ::testing::TestWithParam< + std::tuple> { public: - void RunTest(const miopen::solver::conv::ConvSolverBase& solver, bool use_cpu_ref = false) + void RunTest(const miopen::solver::conv::ConvSolverInterface& solver) { + UnitTestConvSolverParams params; miopenConvAlgorithm_t algo; 
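        // The test tuple now carries a UnitTestConvSolverParams value instead
        // of a bare Gpu mask plus a use_cpu_ref flag, so per-suite knobs (CPU
        // reference, deprecated solvers, tuning cap) travel with each
        // instantiation via GetTestParams().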
ConvTestCase conv_config; - std::tie(std::ignore, algo, conv_config) = GetParam(); - this->RunTestImpl(solver, direction, conv_config, algo, use_cpu_ref); + std::tie(params, algo, conv_config) = GetParam(); + this->RunTestImpl(solver, params, direction, conv_config, algo); } protected: void SetUp() override { - Gpu supported_devs; - std::tie(supported_devs, std::ignore, std::ignore) = GetParam(); - this->SetUpImpl(supported_devs); + UnitTestConvSolverParams params; + std::tie(params, std::ignore, std::ignore) = GetParam(); + this->SetUpImpl(params); } }; @@ -132,8 +150,8 @@ using UnitTestConvSolverWrw = UnitTestConvSolver class UnitTestConvSolverDevApplicability : public UnitTestConvSolverDevApplicabilityBase, - public ::testing::TestWithParam> + public ::testing::TestWithParam> { public: - void RunTest(const miopen::solver::conv::ConvSolverBase& solver) + void RunTest(const miopen::solver::conv::ConvSolverInterface& solver) { - Gpu supported_devs; + UnitTestConvSolverParams params; ConvTestCase conv_config; - std::tie(supported_devs, conv_config) = GetParam(); - this->RunTestImpl(solver, supported_devs, direction, conv_config); + std::tie(params, conv_config) = GetParam(); + this->RunTestImpl(solver, params, direction, conv_config); } }; diff --git a/test/gtest/unit_conv_solver_ConvDirectNaiveConvBwd.cpp b/test/gtest/unit_conv_solver_ConvDirectNaiveConvBwd.cpp index e98c6b2acc..073b67be51 100644 --- a/test/gtest/unit_conv_solver_ConvDirectNaiveConvBwd.cpp +++ b/test/gtest/unit_conv_solver_ConvDirectNaiveConvBwd.cpp @@ -56,23 +56,31 @@ auto GetConvTestCasesFull(miopenDataType_t datatype) return cases; } -Gpu GetSupportedDevices() { return Gpu::All; } +const auto& GetTestParams() +{ + static const auto params = [] { + auto p = miopen::unit_tests::UnitTestConvSolverParams(Gpu::All); + p.UseCpuRef(); // CPU verification + return p; + }(); + return params; +} } // namespace TEST_P(GPU_UnitTestConvSolverBwd_FP16, ConvDirectNaiveConvBwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvBwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvBwd{}); }; TEST_P(GPU_UnitTestConvSolverBwd_BFP16, ConvDirectNaiveConvBwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvBwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvBwd{}); }; TEST_P(GPU_UnitTestConvSolverBwd_FP32, ConvDirectNaiveConvBwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvBwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvBwd{}); }; TEST_P(CPU_UnitTestConvSolverDevApplicabilityBwd_NONE, ConvDirectNaiveConvBwd) @@ -83,31 +91,31 @@ TEST_P(CPU_UnitTestConvSolverDevApplicabilityBwd_NONE, ConvDirectNaiveConvBwd) // Smoke tests INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverBwd_FP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenHalf)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverBwd_BFP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenBFloat16)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverBwd_FP32, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), 
testing::ValuesIn(GetConvTestCases(miopenFloat)))); // Device applicability test INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverDevApplicabilityBwd_NONE, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetConvTestCases(miopenFloat)[0]))); // Full tests INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverBwd_FP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCasesFull(miopenHalf)))); diff --git a/test/gtest/unit_conv_solver_ConvDirectNaiveConvFwd.cpp b/test/gtest/unit_conv_solver_ConvDirectNaiveConvFwd.cpp index 4b87ee6d90..a9253e945b 100644 --- a/test/gtest/unit_conv_solver_ConvDirectNaiveConvFwd.cpp +++ b/test/gtest/unit_conv_solver_ConvDirectNaiveConvFwd.cpp @@ -72,28 +72,36 @@ auto GetConvTestCasesFull(miopenDataType_t datatype) return cases; } -Gpu GetSupportedDevices() { return Gpu::All; } +const auto& GetTestParams() +{ + static const auto params = [] { + auto p = miopen::unit_tests::UnitTestConvSolverParams(Gpu::All); + p.UseCpuRef(); // CPU verification + return p; + }(); + return params; +} } // namespace TEST_P(GPU_UnitTestConvSolverFwd_I8, ConvDirectNaiveConvFwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}); }; TEST_P(GPU_UnitTestConvSolverFwd_FP16, ConvDirectNaiveConvFwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}); }; TEST_P(GPU_UnitTestConvSolverFwd_BFP16, ConvDirectNaiveConvFwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}); }; TEST_P(GPU_UnitTestConvSolverFwd_FP32, ConvDirectNaiveConvFwd) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvFwd{}); }; TEST_P(CPU_UnitTestConvSolverDevApplicabilityFwd_NONE, ConvDirectNaiveConvFwd) @@ -104,49 +112,49 @@ TEST_P(CPU_UnitTestConvSolverDevApplicabilityFwd_NONE, ConvDirectNaiveConvFwd) // Smoke tests INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverFwd_I8, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenInt8)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverFwd_FP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenHalf)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverFwd_BFP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenBFloat16)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverFwd_FP32, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenFloat)))); // Device applicability test INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverDevApplicabilityFwd_NONE, 
- testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(GetConvTestCases(miopenFloat)[0]))); // Full tests INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverFwd_I8, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCasesFull(miopenInt8)))); INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverFwd_FP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCasesFull(miopenHalf)))); INSTANTIATE_TEST_SUITE_P(Full, GPU_UnitTestConvSolverFwd_FP32, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCasesFull(miopenFloat)))); diff --git a/test/gtest/unit_conv_solver_ConvDirectNaiveConvWrw.cpp b/test/gtest/unit_conv_solver_ConvDirectNaiveConvWrw.cpp index 6772003a1e..0d99720194 100644 --- a/test/gtest/unit_conv_solver_ConvDirectNaiveConvWrw.cpp +++ b/test/gtest/unit_conv_solver_ConvDirectNaiveConvWrw.cpp @@ -39,23 +39,31 @@ auto GetConvTestCases(miopenDataType_t datatype) }; } -Gpu GetSupportedDevices() { return Gpu::All; } +const auto& GetTestParams() +{ + static const auto params = [] { + auto p = miopen::unit_tests::UnitTestConvSolverParams(Gpu::All); + p.UseCpuRef(); // CPU verification + return p; + }(); + return params; +} } // namespace TEST_P(GPU_UnitTestConvSolverWrw_FP16, ConvDirectNaiveConvWrw) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvWrw{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvWrw{}); }; TEST_P(GPU_UnitTestConvSolverWrw_BFP16, ConvDirectNaiveConvWrw) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvWrw{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvWrw{}); }; TEST_P(GPU_UnitTestConvSolverWrw_FP32, ConvDirectNaiveConvWrw) { - this->RunTest(miopen::solver::conv::ConvDirectNaiveConvWrw{}, true); // CPU verification + this->RunTest(miopen::solver::conv::ConvDirectNaiveConvWrw{}); }; TEST_P(CPU_UnitTestConvSolverDevApplicabilityWrw_NONE, ConvDirectNaiveConvWrw) @@ -66,24 +74,24 @@ TEST_P(CPU_UnitTestConvSolverDevApplicabilityWrw_NONE, ConvDirectNaiveConvWrw) // Smoke tests INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverWrw_FP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenHalf)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverWrw_BFP16, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenBFloat16)))); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_UnitTestConvSolverWrw_FP32, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), testing::Values(miopenConvolutionAlgoDirect), testing::ValuesIn(GetConvTestCases(miopenFloat)))); // Device applicability test INSTANTIATE_TEST_SUITE_P(Smoke, CPU_UnitTestConvSolverDevApplicabilityWrw_NONE, - testing::Combine(testing::Values(GetSupportedDevices()), + testing::Combine(testing::Values(GetTestParams()), 
testing::Values(GetConvTestCases(miopenFloat)[0]))); diff --git a/test/gtest/unit_conv_solver_ConvOclBwdWrW2.hpp b/test/gtest/unit_conv_solver_ConvOclBwdWrW2.hpp new file mode 100644 index 0000000000..24ca91df95 --- /dev/null +++ b/test/gtest/unit_conv_solver_ConvOclBwdWrW2.hpp @@ -0,0 +1,108 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#include "unit_conv_solver.hpp" + +#ifndef NUM_BATCH_LOOPS +#error "NUM_BATCH_LOOPS undefined" +#endif + +#define MAKE_SOLVER_NAME1(name, suffix) name##suffix +#define MAKE_SOLVER_NAME(name, suffix) MAKE_SOLVER_NAME1(name, suffix) + +#define SOLVER_NAME MAKE_SOLVER_NAME(ConvOclBwdWrW2N, NUM_BATCH_LOOPS) + +namespace { + +auto GetConvTestCases(miopenDataType_t datatype) +{ + using TestCase = miopen::unit_tests::ConvTestCase; + const std::size_t N = NUM_BATCH_LOOPS; + + return std::vector{ + // clang-format off + TestCase{{N, 1, 16, 16}, {1, 1, 4, 4}, {3, 3}, {1, 1}, {1, 1}, datatype}, + // clang-format on + }; +} + +const auto& GetTestParams() +{ + static const auto params = [] { + auto p = miopen::unit_tests::UnitTestConvSolverParams(Gpu::All); + p.EnableDeprecatedSolvers(); + p.Tunable(5); + return p; + }(); + return params; +} + +} // namespace + +TEST_P(GPU_UnitTestConvSolverWrw_FP16, SOLVER_NAME) +{ + this->RunTest(miopen::solver::conv::ConvOclBwdWrW2{}); +}; + +TEST_P(GPU_UnitTestConvSolverWrw_BFP16, SOLVER_NAME) +{ + this->RunTest(miopen::solver::conv::ConvOclBwdWrW2{}); +}; + +TEST_P(GPU_UnitTestConvSolverWrw_FP32, SOLVER_NAME) +{ + this->RunTest(miopen::solver::conv::ConvOclBwdWrW2{}); +}; + +TEST_P(CPU_UnitTestConvSolverDevApplicabilityWrw_NONE, SOLVER_NAME) +{ + this->RunTest(miopen::solver::conv::ConvOclBwdWrW2{}); +}; + +// Smoke tests +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_UnitTestConvSolverWrw_FP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenConvolutionAlgoDirect), + testing::ValuesIn(GetConvTestCases(miopenHalf)))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_UnitTestConvSolverWrw_BFP16, + testing::Combine(testing::Values(GetTestParams()), + testing::Values(miopenConvolutionAlgoDirect), + testing::ValuesIn(GetConvTestCases(miopenBFloat16)))); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_UnitTestConvSolverWrw_FP32, + 
testing::Combine(testing::Values(GetTestParams()),
+                                         testing::Values(miopenConvolutionAlgoDirect),
+                                         testing::ValuesIn(GetConvTestCases(miopenFloat))));
+
+// Device applicability test
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         CPU_UnitTestConvSolverDevApplicabilityWrw_NONE,
+                         testing::Combine(testing::Values(GetTestParams()),
+                                          testing::Values(GetConvTestCases(miopenFloat)[0])));
diff --git a/src/include/miopen/type_traits.hpp b/test/gtest/unit_conv_solver_ConvOclBwdWrW2N1.cpp
similarity index 62%
rename from src/include/miopen/type_traits.hpp
rename to test/gtest/unit_conv_solver_ConvOclBwdWrW2N1.cpp
index c343c2a8e6..6b4b1de9e2 100644
--- a/src/include/miopen/type_traits.hpp
+++ b/test/gtest/unit_conv_solver_ConvOclBwdWrW2N1.cpp
@@ -2,7 +2,7 @@
  *
  * MIT License
  *
- * Copyright (c) 2022 Advanced Micro Devices, Inc.
+ * Copyright (c) 2024 Advanced Micro Devices, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -24,34 +24,5 @@
  *
  *******************************************************************************/
 
-#pragma once
-
-#include <type_traits>
-
-namespace miopen {
-
-namespace detail {
-
-template <class...>
-using void_t = void;
-
-template <class Default, class AlwaysVoid, template <class...> class Op, class... Args>
-struct MemberDetector
-{
-    using value_t = std::false_type;
-    using type    = Default;
-};
-
-template <class Default, template <class...> class Op, class... Args>
-struct MemberDetector<Default, void_t<Op<Args...>>, Op, Args...>
-{
-    using value_t = std::true_type;
-    using type    = Op<Args...>;
-};
-
-} // namespace detail
-
-template
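
// The new unit_conv_solver_ConvOclBwdWrW2.hpp header refuses to compile unless
// NUM_BATCH_LOOPS is defined (the #error guard), and MAKE_SOLVER_NAME pastes
// that count into the gtest case name, so each consuming .cpp registers a
// distinct SOLVER_NAME such as ConvOclBwdWrW2N1 while sharing one test body.
// A minimal sketch of such a translation unit follows; the hunk above is cut
// off before the new file's body, so the exact contents of
// unit_conv_solver_ConvOclBwdWrW2N1.cpp are an assumption, not the PR's code:

#define NUM_BATCH_LOOPS 1
#include "unit_conv_solver_ConvOclBwdWrW2.hpp"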