From f132d66816263a509b46092e14755b5e3f0e258f Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Wed, 10 Nov 2021 11:41:39 +0000 Subject: render/vulkan: Optimize vertex shader This ends up being a horrible global load: s_getpc_b64 s[4:5] // 000000000000: BE841C80 v_add_u32 v0, s2, v0 // 000000000004: 68000002 v_sub_co_u32 v1, vcc, 0, v0 // 000000000008: 34020080 v_max_i32 v1, v0, v1 // 00000000000C: 1A020300 v_and_b32 v1, 3, v1 // 000000000010: 26020283 v_cmp_lt_i32 s[0:1], v0, 0 // 000000000014: D0C10000 00010100 v_sub_co_u32 v0, vcc, 0, v1 // 00000000001C: 34000280 v_cndmask_b32 v0, v1, v0, s[0:1] // 000000000020: D1000000 00020101 v_lshlrev_b32 v1, 3, v0 // 000000000028: 24020083 v_mad_u32_u24 v0, v0, 8, 4 // 00000000002C: D1C30000 02111100 v_min_u32 v1, 32, v1 // 000000000034: 1C0202A0 v_min_u32 v0, 32, v0 // 000000000038: 1C0000A0 s_getpc_b64 s[0:1] // 00000000003C: BE801C00 s_add_u32 s0, s0, 0x0000003c // 000000000040: 8000FF00 0000003C s_addc_u32 s1, s1, 0 // 000000000048: 82018001 global_load_dword v1, v[1:2], s[0:1] // 00000000004C: DC508000 01000001 global_load_dword v0, v[0:1], s[0:1] // 000000000054: DC508000 00000000 v_mov_b32 v2, 0 // 00000000005C: 7E040280 v_mov_b32 v3, 1.0 // 000000000060: 7E0602F2 s_waitcnt vmcnt(0) // 000000000064: BF8C0F70 exp pos0, v1, v0, v2, v3 done // 000000000068: C40008CF 03020001 exp param0, off, off, off, off // 000000000070: C4000200 00000000 s_endpgm // 000000000078: BF810000 v_cndmask_b32 v0, s0, v0, vcc // 00000000007C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000000000080: 00000000 v_add_f16 v192, s0, v0 // 000000000084: 3F800000 v_cndmask_b32 v0, s0, v0, vcc // 000000000088: 00000000 v_add_f16 v192, s0, v0 // 00000000008C: 3F800000 v_add_f16 v192, s0, v0 // 000000000090: 3F800000 v_cndmask_b32 v0, s0, v0, vcc // 000000000094: 00000000 v_add_f16 v192, s0, v0 // 000000000098: 3F800000 v_cndmask_b32 v0, s0, v0, vcc // 00000000009C: 00000000 With some bit magic, we can get something much nicer: v_add_u32 v0, s2, v0 // 
000000000000: 68000002 v_add_u32 v1, 1, v0 // 000000000004: 68020081 v_and_b32 v1, 2, v1 // 000000000008: 26020282 v_cvt_f32_i32 v1, v1 // 00000000000C: 7E020B01 v_mul_f32 v1, 0.5, v1 // 000000000010: 0A0202F0 v_and_b32 v0, 2, v0 // 000000000014: 26000082 v_cvt_f32_i32 v0, v0 // 000000000018: 7E000B00 v_mul_f32 v0, 0.5, v0 // 00000000001C: 0A0000F0 v_mov_b32 v2, 0 // 000000000020: 7E040280 v_mov_b32 v3, 1.0 // 000000000024: 7E0602F2 exp pos0, v1, v0, v2, v3 done // 000000000028: C40008CF 03020001 exp param0, off, off, off, off // 000000000030: C4000200 00000000 s_endpgm // 000000000038: BF810000 The above output was based on just shoving it in ShaderPlayground -- I was not able to use pipeline feedback as I was unable to get RenderDoc working due to the EXT_physical_device_drm requirement. I additionally considered using >> 1 instead of * 0.5, but AMD has dedicated modifiers to merge a * 0.5, * 2.0, etc. into a single instruction. (Admittedly, this is not taken advantage of in the code above, but it might be with ACO.) Signed-off-by: Joshua Ashton --- render/vulkan/shaders/common.vert | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'render/vulkan/shaders') diff --git a/render/vulkan/shaders/common.vert b/render/vulkan/shaders/common.vert index fa31d26c..c6175d24 100644 --- a/render/vulkan/shaders/common.vert +++ b/render/vulkan/shaders/common.vert @@ -10,16 +10,9 @@ layout(push_constant, row_major) uniform UBO { layout(location = 0) out vec2 uv; -// 4 outlining points and uv coords -const vec2[] values = { - {0, 0}, - {1, 0}, - {1, 1}, - {0, 1}, -}; - void main() { - vec2 pos = values[gl_VertexIndex % 4]; + vec2 pos = vec2(float((gl_VertexIndex + 1) & 2) * 0.5f, + float(gl_VertexIndex & 2) * 0.5f); uv = data.uv_offset + pos * data.uv_size; gl_Position = data.proj * vec4(pos, 0.0, 1.0); } -- cgit v1.2.3