forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use 2d weight and bias texture for conv2d quantized op (pytorch#114902)
Summary: The performance with a 2D texture for weight and bias is better for quantized conv2d; the un-quantized version of conv2d also uses a 2D texture. The performance gain is:

With 3D:
Kernel Name                      Workgroup Size   Duration P50 (ns)
===========                      ==============   =================
vulkan.quantized_conv2d          {96, 72, 2}      5965440
vulkan.quantized_conv2d          {96, 72, 2}      11316968
vulkan.quantized_conv2d_dw       {96, 72, 2}      2735564
vulkan.quantized_conv2d_pw_2x2   {96, 72, 2}      1645696

With 2D:
vulkan.quantized_conv2d          {96, 72, 2}      4295772
vulkan.quantized_conv2d          {96, 72, 2}      7874620
vulkan.quantized_conv2d_dw       {96, 72, 2}      2658552
vulkan.quantized_conv2d_pw_2x2   {96, 72, 2}      1632020

Test Plan: Ensure all vulkan quantize tests pass:

buck2 run --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_quantized_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output"
Running main() from third-party/googletest/1.11.0/googletest/googletest/src/gtest_main.cc
[==========] Running 78 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 78 tests from VulkanAPITest
....
[----------] 78 tests from VulkanAPITest (1519 ms total)
[----------] Global test environment tear-down
[==========] 78 tests from 1 test suite ran. (1519 ms total)
[ PASSED ] 78 tests.

buck2 run --target-platforms ovr_config//platform/macos:arm64-fbsource //xplat/caffe2:pt_vulkan_api_test_binAppleMac\#macosx-arm64 -c pt.vulkan_full_precision=1 --show-output"
Running main() from third-party/googletest/1.11.0/googletest/googletest/src/gtest_main.cc
[==========] Running 395 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 395 tests from VulkanAPITest
......
[----------] 395 tests from VulkanAPITest (6515 ms total)
[----------] Global test environment tear-down
[==========] 395 tests from 1 test suite ran. (6515 ms total)
[ PASSED ] 394 tests.
[ SKIPPED ] 1 test, listed below: [ SKIPPED ] VulkanAPITest.querypool_flushed_shader_log YOU HAVE 5 DISABLED TESTS Reviewed By: yipjustin Differential Revision: D50997534 Pull Request resolved: pytorch#114902 Approved by: https://github.com/yipjustin
- Loading branch information
1 parent
6317a03
commit 8dbae73
Showing
8 changed files
with
237 additions
and
22 deletions.
There are no files selected for viewing
53 changes: 53 additions & 0 deletions
53
aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int32.glsl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
#version 450 core
// PRECISION and FORMAT expand to the $precision / $format placeholders, which
// are presumably filled in by the shader build step (not visible in this
// file). FORMAT is defined but not referenced by this shader.
#define PRECISION $precision
#define FORMAT $format

// Buffer blocks default to the std430 layout.
layout(std430) buffer;

// Destination: write-only 2D image, four 32-bit signed integers per texel.
layout(set = 0, binding = 0, rgba32i) uniform PRECISION restrict writeonly iimage2D uImage;

// Source: read-only storage buffer exposed as a flat array of ints.
layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
  int data[];
}
uBuffer;

// Parameters.
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
  // xyz contain the extents of the output texture, w contains HxW to help
  // calculate buffer offsets
  ivec4 out_extents;
}
uBlock;

// The work group dimensions are taken from specialization constants 0, 1, 2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Each invocation gathers four ints from the buffer, spaced out_extents.w
// elements apart, and writes them as one texel of the output image.
void main() {
  const ivec3 texel = ivec3(gl_GlobalInvocationID);

  // Invocations that fall outside the output extents have nothing to write.
  if (any(greaterThanEqual(texel, uBlock.out_extents.xyz))) {
    return;
  }

  // Distance, in buffer elements, between consecutive components of a texel.
  const int plane = uBlock.out_extents.w;

  // Buffer index of the texel's first (x) component. Every z step skips the
  // four planes consumed by one layer of texels.
  const int first =
      texel.x + uBlock.out_extents.x * texel.y + (4 * plane) * texel.z;

  const ivec4 value = ivec4(
      uBuffer.data[first],
      uBuffer.data[first + plane],
      uBuffer.data[first + 2 * plane],
      uBuffer.data[first + 3 * plane]);

  // Only the xy part of the position addresses the 2D image.
  imageStore(uImage, texel.xy, value);
}
81 changes: 81 additions & 0 deletions
81
aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_int8.glsl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
#version 450 core
// PRECISION and FORMAT expand to the $precision / $format placeholders, which
// are presumably filled in by the shader build step (not visible in this
// file). FORMAT is defined but not referenced by this shader.
#define PRECISION $precision
#define FORMAT $format

// Buffer blocks default to the std430 layout.
layout(std430) buffer;

// Destination: write-only 2D image, four 8-bit signed integers per texel.
layout(set = 0, binding = 0, rgba8i) uniform PRECISION restrict writeonly iimage2D uImage;

// Source: read-only storage buffer exposed as 32-bit ints. The indexing below
// treats every int as four packed 8-bit elements.
layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
  int data[];
}
uBuffer;

// Parameters.
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
  // xyz contain the extents of the output texture, w contains HxW to help
  // calculate buffer offsets
  ivec4 out_extents;
}
uBlock;

// The work group dimensions are taken from specialization constants 0, 1, 2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Sign-extends an 8-bit value held in the low byte of x.
// When bit 7 is the highest set bit (x >> 7 == 1) the upper 24 bits are filled
// with ones; any other value is returned unchanged. That includes values that
// are already negative, for which x >> 7 is -1 rather than 1.
int extend_sign(int x) {
  return (x >> 7 == 1) ? (x | 0xFFFFFF00) : x;
}

// Reads the 8-bit element with index elem from the packed buffer and returns
// it as a sign-extended int. Element elem is stored in word elem / 4 at bit
// offset 8 * (elem % 4).
int load_int8(int elem) {
  const int bit_offset = 8 * (elem % 4);
  const int byte_mask = ((1 << 8) - 1) << bit_offset;
  const int word = uBuffer.data[elem / 4];

  // For the top byte (bit_offset == 24) the right shift of a signed int
  // already propagates the sign bit, so extend_sign leaves the result as is.
  return extend_sign((word & byte_mask) >> bit_offset);
}

// Each invocation gathers four 8-bit elements from the buffer, spaced
// out_extents.w elements apart, and writes them as one texel of the output
// image.
void main() {
  const ivec3 texel = ivec3(gl_GlobalInvocationID);

  // Invocations that fall outside the output extents have nothing to write.
  if (any(greaterThanEqual(texel, uBlock.out_extents.xyz))) {
    return;
  }

  // Element index of the texel's first (x) component. Every z step skips the
  // four planes consumed by one layer of texels.
  const int first =
      texel.x + uBlock.out_extents.x * texel.y + (4 * uBlock.out_extents.w) * texel.z;
  const ivec4 elems = first + ivec4(0, 1, 2, 3) * uBlock.out_extents.w;

  const ivec4 value = ivec4(
      load_int8(elems.x),
      load_int8(elems.y),
      load_int8(elems.z),
      load_int8(elems.w));

  // Only the xy part of the position addresses the 2D image.
  imageStore(uImage, texel.xy, value);
}
67 changes: 67 additions & 0 deletions
67
aten/src/ATen/native/vulkan/glsl/nchw_to_image2d_uint8.glsl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#version 450 core
// PRECISION and FORMAT expand to the $precision / $format placeholders, which
// are presumably filled in by the shader build step (not visible in this
// file). FORMAT is defined but not referenced by this shader.
#define PRECISION $precision
#define FORMAT $format

// Buffer blocks default to the std430 layout.
layout(std430) buffer;

// Destination: write-only 2D image, four 8-bit unsigned integers per texel.
layout(set = 0, binding = 0, rgba8ui) uniform PRECISION restrict writeonly uimage2D uImage;

// Source: read-only storage buffer exposed as 32-bit uints. The indexing
// below treats every uint as four packed 8-bit elements.
layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
  uint data[];
}
uBuffer;

// Parameters.
layout(set = 0, binding = 2) uniform PRECISION restrict Block {
  // xyz contain the extents of the output texture, w contains HxW to help
  // calculate buffer offsets
  ivec4 out_extents;
}
uBlock;

// The work group dimensions are taken from specialization constants 0, 1, 2.
layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

// Reads the 8-bit element with index elem from the packed buffer. Element
// elem is stored in word elem / 4 at bit offset 8 * (elem % 4).
uint load_uint8(int elem) {
  const int bit_offset = 8 * (elem % 4);
  const int byte_mask = ((1 << 8) - 1) << bit_offset;
  const uint word = uBuffer.data[elem / 4];
  return (word & byte_mask) >> bit_offset;
}

// Each invocation gathers four 8-bit elements from the buffer, spaced
// out_extents.w elements apart, and writes them as one texel of the output
// image.
void main() {
  const ivec3 texel = ivec3(gl_GlobalInvocationID);

  // Invocations that fall outside the output extents have nothing to write.
  if (any(greaterThanEqual(texel, uBlock.out_extents.xyz))) {
    return;
  }

  // Element index of the texel's first (x) component. Every z step skips the
  // four planes consumed by one layer of texels.
  const int first =
      texel.x + uBlock.out_extents.x * texel.y + (4 * uBlock.out_extents.w) * texel.z;
  const ivec4 elems = first + ivec4(0, 1, 2, 3) * uBlock.out_extents.w;

  const uvec4 value = uvec4(
      load_uint8(elems.x),
      load_uint8(elems.y),
      load_uint8(elems.z),
      load_uint8(elems.w));

  // Only the xy part of the position addresses the 2D image.
  imageStore(uImage, texel.xy, value);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters