From 8dbae73e624e7fd99e27c9ba0a6e9837e7224b45 Mon Sep 17 00:00:00 2001 From: Shubhraprakash Das Date: Mon, 4 Dec 2023 20:54:40 +0000 Subject: [PATCH] Use 2d weight and bias texture for conv2d quantized op (#114902) Summary: The performance with 2D texture for weight and bias is better for quantized conv2d, the un-quantized version of conv2d also uses 2D texture. The performance gain is: With 3D: Kernel Name Workgroup Size Duration P50 (ns) =========== ============== ================= vulkan.quantized_conv2d {96, 72, 2} 5965440 vulkan.quantized_conv2d {96, 72, 2} 11316968 vulkan.quantized_conv2d_dw{96, 72, 2} 2735564 vulkan.quantized_conv2d_pw_2x2{96, 72, 2} 1645696 With 2D: vulkan.quantized_conv2d {96, 72, 2} 4295772 vulkan.quantized_conv2d {96, 72, 2} 7874620 vulkan.quantized_conv2d_dw{96, 72, 2} 2658552 vulkan.quantized_conv2d_pw_2x2{96, 72, 2} 1632020 Test Plan: Ensure all vulkan quantize tests pass: buck2 run --target-platforms ovr_configplatform/macos:arm64-fbsourcexplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output" Running main() from third-party/googletest/1.11.0/googletest/googletest/src/gtest_main.cc [==========] Running 78 tests from 1 test suite. [----------] Global test environment set-up. [----------] 78 tests from VulkanAPITest .... [----------] 78 tests from VulkanAPITest (1519 ms total) [----------] Global test environment tear-down [==========] 78 tests from 1 test suite ran. (1519 ms total) [ PASSED ] 78 tests. buck2 run --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output" Running main() from third-party/googletest/1.11.0/googletest/googletest/src/gtest_main.cc [==========] Running 395 tests from 1 test suite. [----------] Global test environment set-up. [----------] 395 tests from VulkanAPITest ...... ----------] 395 tests from VulkanAPITest (6515 ms total) [----------] Global test environment tear-down [==========] 395 tests from 1 test suite ran. (6515 ms total) [ PASSED ] 394 tests. [ SKIPPED ] 1 test, listed below: [ SKIPPED ] VulkanAPITest.querypool_flushed_shader_log YOU HAVE 5 DISABLED TESTS Reviewed By: yipjustin Differential Revision: D50997534 Pull Request resolved: https://github.com/pytorch/pytorch/pull/114902 Approved by: https://github.com/yipjustin --- .../vulkan/glsl/nchw_to_image2d_int32.glsl | 53 ++++++++++++ .../vulkan/glsl/nchw_to_image2d_int8.glsl | 81 +++++++++++++++++++ .../vulkan/glsl/nchw_to_image2d_uint8.glsl | 67 +++++++++++++++ .../native/vulkan/glsl/quantized_conv2d.glsl | 14 ++-- .../vulkan/glsl/quantized_conv2d_dw.glsl | 8 +- .../vulkan/glsl/quantized_conv2d_pw_2x2.glsl | 14 ++-- aten/src/ATen/native/vulkan/impl/Packing.cpp | 14 ++++ .../ATen/native/vulkan/ops/Convolution.cpp | 8 +- 8 files changed, 237 insertions(+), 22 deletions(-) create mode 100644 aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int32.glsl create mode 100644 aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int8.glsl create mode 100644 aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_uint8.glsl diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int32.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int32.glsl new file mode 100644 index 00000000000000..e9d7d268017a27 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int32.glsl @@ -0,0 +1,53 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* + * Output Image + */ +layout(set = 0, binding = 0, rgba32i) uniform PRECISION restrict writeonly iimage2D uImage; + +/* + * Input Buffer + */ +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + int data[]; +} +uBuffer; + +/* + * Params Buffer + */ +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + // xyz contain the extents of the output texture, w contains HxW to help + // calculate buffer offsets + ivec4 out_extents; +} +uBlock; + +/* + * Local Work Group Size + */ +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) { + return; + } + + const int base_index = + pos.x + uBlock.out_extents.x * pos.y + (4 * uBlock.out_extents.w) * pos.z; + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * uBlock.out_extents.w; + + int val_x = uBuffer.data[buf_indices.x]; + int val_y = uBuffer.data[buf_indices.y]; + int val_z = uBuffer.data[buf_indices.z]; + int val_w = uBuffer.data[buf_indices.w]; + + imageStore(uImage, pos.xy, ivec4(val_x, val_y, val_z, val_w)); +} diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int8.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int8.glsl new file mode 100644 index 00000000000000..87568dde98e759 --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int8.glsl @@ -0,0 +1,81 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* + * Output Image + */ +layout(set = 0, binding = 0, rgba8i) uniform PRECISION restrict writeonly iimage2D uImage; + +/* + * Input Buffer + */ +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + int data[]; +} +uBuffer; + +/* + * Extends sign of int8 + */ +int extend_sign(int x) { + if (x >> 7 == 1) { + return x | 0xFFFFFF00; + } + return x; +} + +/* + * Params Buffer + */ +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + // xyz contain the extents of the output texture, w contains HxW to help + // calculate buffer offsets + ivec4 out_extents; +} +uBlock; + +/* + * Local Work Group Size + */ +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) { + return; + } + + const int base_index = + pos.x + uBlock.out_extents.x * pos.y + (4 * uBlock.out_extents.w) * pos.z; + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * uBlock.out_extents.w; + + int shift = (1 << 8) - 1; + ivec4 masks; + masks.x = shift << 8 * (buf_indices.x % 4); + masks.y = shift << 8 * (buf_indices.y % 4); + masks.z = shift << 8 * (buf_indices.z % 4); + masks.w = shift << 8 * (buf_indices.w % 4); + + int buf_in_1 = uBuffer.data[buf_indices.x / 4]; + int val_x = (buf_in_1 & masks.x) >> 8 * (buf_indices.x % 4); + val_x = extend_sign(val_x); + + int buf_in_2 = uBuffer.data[buf_indices.y / 4]; + int val_y = (buf_in_2 & masks.y) >> 8 * (buf_indices.y % 4); + val_y = extend_sign(val_y); + + int buf_in_3 = uBuffer.data[buf_indices.z / 4]; + int val_z = (buf_in_3 & masks.z) >> 8 * (buf_indices.z % 4); + val_z = extend_sign(val_z); + + int buf_in_4 = uBuffer.data[buf_indices.w / 4]; + int val_w = (buf_in_4 & masks.w) >> 8 * (buf_indices.w % 4); + val_w = extend_sign(val_w); + + imageStore(uImage, pos.xy, ivec4(val_x, val_y, val_z, val_w)); +} diff --git a/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_uint8.glsl b/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_uint8.glsl new file mode 100644 index 00000000000000..4cc241fa80621f --- /dev/null +++ b/aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_uint8.glsl @@ -0,0 +1,67 @@ +#version 450 core +#define PRECISION $precision +#define FORMAT $format + +layout(std430) buffer; + +/* + * Output Image + */ +layout(set = 0, binding = 0, rgba8ui) uniform PRECISION restrict writeonly uimage2D uImage; + +/* + * Input Buffer + */ +layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer { + uint data[]; +} +uBuffer; + +/* + * Params Buffer + */ +layout(set = 0, binding = 2) uniform PRECISION restrict Block { + // xyz contain the extents of the output texture, w contains HxW to help + // calculate buffer offsets + ivec4 out_extents; +} +uBlock; + +/* + * Local Work Group Size + */ +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +void main() { + const ivec3 pos = ivec3(gl_GlobalInvocationID); + + if (any(greaterThanEqual(pos, uBlock.out_extents.xyz))) { + return; + } + + const int base_index = + pos.x + uBlock.out_extents.x * pos.y + (4 * uBlock.out_extents.w) * pos.z; + const ivec4 buf_indices = + base_index + ivec4(0, 1, 2, 3) * uBlock.out_extents.w; + + int shift = (1 << 8) - 1; + ivec4 masks; + masks.x = shift << 8 * (buf_indices.x % 4); + masks.y = shift << 8 * (buf_indices.y % 4); + masks.z = shift << 8 * (buf_indices.z % 4); + masks.w = shift << 8 * (buf_indices.w % 4); + + uint buf_in_1 = uBuffer.data[buf_indices.x / 4]; + uint a_v = (buf_in_1 & masks.x) >> 8 * (buf_indices.x % 4); + + uint buf_in_2 = uBuffer.data[buf_indices.y / 4]; + uint b_v = (buf_in_2 & masks.y) >> 8 * (buf_indices.y % 4); + + uint buf_in_3 = uBuffer.data[buf_indices.z / 4]; + uint g_v = (buf_in_3 & masks.z) >> 8 * (buf_indices.z % 4); + + uint buf_in_4 = uBuffer.data[buf_indices.w / 4]; + uint r_v = (buf_in_4 & masks.w) >> 8 * (buf_indices.w % 4); + + imageStore(uImage, pos.xy, uvec4(a_v, b_v, g_v, r_v)); +} diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl index b1e8f2b2123ede..f2915cfbe7ed21 100644 --- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl +++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d.glsl @@ -21,8 +21,8 @@ layout(set = 0, binding = 0, rgba8ui) uniform PRECISION restrict writeonly uimag * Input Textures */ layout(set = 0, binding = 1) uniform PRECISION isampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION isampler3D uKernel; -layout(set = 0, binding = 3) uniform PRECISION isampler3D uBias; +layout(set = 0, binding = 2) uniform PRECISION isampler2D uKernel; +layout(set = 0, binding = 3) uniform PRECISION isampler2D uBias; /* * Params Buffer @@ -103,7 +103,7 @@ void main() { kstart.y += pos.z * uBlock.kernel_size.y; vec4 sum = dequantize( - texelFetch(uBias, ivec3(pos.z, 0, 0), 0), + texelFetch(uBias, ivec2(pos.z, 0), 0), uBlock.scales.w, uBlock.zero_points.w); @@ -153,25 +153,25 @@ void main() { // which is what is expressed in the following calculations. const vec4 ktex_0 = dequantize( - texelFetch(uKernel, ivec3(kx + 0, ky, 0), 0), + texelFetch(uKernel, ivec2(kx + 0, ky), 0), uBlock.scales.z, uBlock.zero_points.z); sum = fma(in_tex.xxxx, ktex_0, sum); const vec4 ktex_1 = dequantize( - texelFetch(uKernel, ivec3(kx + 1, ky, 0), 0), + texelFetch(uKernel, ivec2(kx + 1, ky), 0), uBlock.scales.z, uBlock.zero_points.z); sum = fma(in_tex.yyyy, ktex_1, sum); const vec4 ktex_2 = dequantize( - texelFetch(uKernel, ivec3(kx + 2, ky, 0), 0), + texelFetch(uKernel, ivec2(kx + 2, ky), 0), uBlock.scales.z, uBlock.zero_points.z); sum = fma(in_tex.zzzz, ktex_2, sum); const vec4 ktex_3 = dequantize( - texelFetch(uKernel, ivec3(kx + 3, ky, 0), 0), + texelFetch(uKernel, ivec2(kx + 3, ky), 0), uBlock.scales.z, uBlock.zero_points.z); sum = fma(in_tex.wwww, ktex_3, sum); diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl index 0d823620a517f9..817ba4fbb40335 100644 --- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_dw.glsl @@ -22,8 +22,8 @@ layout(set = 0, binding = 0, rgba8ui) uniform PRECISION restrict writeonly uimag * Input Textures */ layout(set = 0, binding = 1) uniform PRECISION isampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION isampler3D uKernel; -layout(set = 0, binding = 3) uniform PRECISION isampler3D uBias; +layout(set = 0, binding = 2) uniform PRECISION isampler2D uKernel; +layout(set = 0, binding = 3) uniform PRECISION isampler2D uBias; /* * Params Buffer @@ -90,7 +90,7 @@ void main() { const ivec2 kstart = (start - ipos) / uBlock.dilate; vec4 sum = dequantize( - texelFetch(uBias, ivec3(pos.z, 0, 0), 0), + texelFetch(uBias, ivec2(pos.z, 0), 0), uBlock.scales.w, uBlock.zero_points.w); @@ -104,7 +104,7 @@ void main() { const int k_ind = kx + ky * uBlock.kernel_size.x; const vec4 k_tex = dequantize( - texelFetch(uKernel, ivec3(k_ind, pos.z, 0), 0), + texelFetch(uKernel, ivec2(k_ind, pos.z), 0), uBlock.scales.z, uBlock.zero_points.z); const vec4 in_tex = dequantize( diff --git a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl index 6d63cb1330b505..890086a08a2374 100644 --- a/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl +++ b/aten/src/ATen/native/vulkan/glsl/quantized_conv2d_pw_2x2.glsl @@ -17,8 +17,8 @@ layout(set = 0, binding = 0, rgba8ui) uniform PRECISION restrict writeonly uimag * Input Textures */ layout(set = 0, binding = 1) uniform PRECISION isampler3D uInput; -layout(set = 0, binding = 2) uniform PRECISION isampler3D uKernel; -layout(set = 0, binding = 3) uniform PRECISION isampler3D uBias; +layout(set = 0, binding = 2) uniform PRECISION isampler2D uKernel; +layout(set = 0, binding = 3) uniform PRECISION isampler2D uBias; /* * Params Buffer @@ -100,7 +100,7 @@ void main() { vec4 sum[4]; sum[0] = dequantize( - texelFetch(uBias, ivec3(gpos.z, 0, 0), 0), + texelFetch(uBias, ivec2(gpos.z, 0), 0), uBlock.scales.w, uBlock.zero_points.w); for (int i = 1; i < 4; ++i) { @@ -114,19 +114,19 @@ void main() { // channel (IC) dim is along the x axis, and the batch (OC) dim is along // the z axis. const vec4 ktex_0 = dequantize( - texelFetch(uKernel, ivec3(z + 0, gpos.z, 0), 0), + texelFetch(uKernel, ivec2(z + 0, gpos.z), 0), uBlock.scales.z, uBlock.zero_points.z); const vec4 ktex_1 = dequantize( - texelFetch(uKernel, ivec3(z + 1, gpos.z, 0), 0), + texelFetch(uKernel, ivec2(z + 1, gpos.z), 0), uBlock.scales.z, uBlock.zero_points.z); const vec4 ktex_2 = dequantize( - texelFetch(uKernel, ivec3(z + 2, gpos.z, 0), 0), + texelFetch(uKernel, ivec2(z + 2, gpos.z), 0), uBlock.scales.z, uBlock.zero_points.z); const vec4 ktex_3 = dequantize( - texelFetch(uKernel, ivec3(z + 3, gpos.z, 0), 0), + texelFetch(uKernel, ivec2(z + 3, gpos.z), 0), uBlock.scales.z, uBlock.zero_points.z); diff --git a/aten/src/ATen/native/vulkan/impl/Packing.cpp b/aten/src/ATen/native/vulkan/impl/Packing.cpp index 7d95969ed07c68..b43ca4ecd5f327 100644 --- a/aten/src/ATen/native/vulkan/impl/Packing.cpp +++ b/aten/src/ATen/native/vulkan/impl/Packing.cpp @@ -24,6 +24,20 @@ api::ShaderInfo get_nchw_to_image_shader(const vTensor& v_dst) { "Vulkan quantization currently not supported for dtype ", v_dst.dtype()); } + case api::StorageType::TEXTURE_2D: + switch (v_dst.dtype()) { + case c10::ScalarType::QUInt8: + return VK_KERNEL(nchw_to_image2d_uint8); + case c10::ScalarType::QInt8: + return VK_KERNEL(nchw_to_image2d_int8); + case c10::ScalarType::QInt32: + return VK_KERNEL(nchw_to_image2d_int32); + default: + TORCH_CHECK( + false, + "Vulkan quantization currently not supported for dtype ", + v_dst.dtype()); + } default: TORCH_CHECK(false, "No kernel available!"); case api::StorageType::BUFFER: diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index c60be10e25fd80..d278a321796567 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -1,8 +1,8 @@ -#include -#include #include +#include +#include #include #include #include @@ -529,7 +529,7 @@ vTensor pack_weights( api::context(), weight_rearranged.sizes(), weight_arg.scalar_type(), - quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D, + api::StorageType::TEXTURE_2D, }; if (quantized) { @@ -557,7 +557,7 @@ vTensor pack_biases( api::context(), bias_rearranged.sizes(), bias_rearranged.scalar_type(), - quantized ? api::StorageType::TEXTURE_3D : api::StorageType::TEXTURE_2D, + api::StorageType::TEXTURE_2D, }; if (quantized) {