diff options
| author | Joshua Ashton <joshua@froggi.es> | 2021-11-10 11:41:39 +0000 | 
|---|---|---|
| committer | Simon Zeni <simon@bl4ckb0ne.ca> | 2021-11-23 15:46:24 +0000 | 
| commit | f132d66816263a509b46092e14755b5e3f0e258f (patch) | |
| tree | 91facdf51ce12a57a1f40aab0765e1953f7f6ad2 | |
| parent | 5332935afcdd519dd2dd9dbfeb50c6aa0f91e9c3 (diff) | |
| download | wlroots-f132d66816263a509b46092e14755b5e3f0e258f.tar.xz | |
render/vulkan: Optimize vertex shader
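Indexing the constant `values` table with `gl_VertexIndex % 4` ends up being a horrible global load (the words after s_endpgm are the table's float data, 0.0 and 1.0, which the disassembler decodes as instructions):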
  s_getpc_b64   s[4:5]                                  // 000000000000: BE841C80
  v_add_u32     v0, s2, v0                              // 000000000004: 68000002
  v_sub_co_u32  v1, vcc, 0, v0                          // 000000000008: 34020080
  v_max_i32     v1, v0, v1                              // 00000000000C: 1A020300
  v_and_b32     v1, 3, v1                               // 000000000010: 26020283
  v_cmp_lt_i32  s[0:1], v0, 0                           // 000000000014: D0C10000 00010100
  v_sub_co_u32  v0, vcc, 0, v1                          // 00000000001C: 34000280
  v_cndmask_b32  v0, v1, v0, s[0:1]                     // 000000000020: D1000000 00020101
  v_lshlrev_b32  v1, 3, v0                              // 000000000028: 24020083
  v_mad_u32_u24  v0, v0, 8, 4                           // 00000000002C: D1C30000 02111100
  v_min_u32     v1, 32, v1                              // 000000000034: 1C0202A0
  v_min_u32     v0, 32, v0                              // 000000000038: 1C0000A0
  s_getpc_b64   s[0:1]                                  // 00000000003C: BE801C00
  s_add_u32     s0, s0, 0x0000003c                      // 000000000040: 8000FF00 0000003C
  s_addc_u32    s1, s1, 0                               // 000000000048: 82018001
  global_load_dword  v1, v[1:2], s[0:1]                 // 00000000004C: DC508000 01000001
  global_load_dword  v0, v[0:1], s[0:1]                 // 000000000054: DC508000 00000000
  v_mov_b32     v2, 0                                   // 00000000005C: 7E040280
  v_mov_b32     v3, 1.0                                 // 000000000060: 7E0602F2
  s_waitcnt     vmcnt(0)                                // 000000000064: BF8C0F70
  exp           pos0, v1, v0, v2, v3 done               // 000000000068: C40008CF 03020001
  exp           param0, off, off, off, off              // 000000000070: C4000200 00000000
  s_endpgm                                              // 000000000078: BF810000
  v_cndmask_b32  v0, s0, v0, vcc                        // 00000000007C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                        // 000000000080: 00000000
  v_add_f16     v192, s0, v0                            // 000000000084: 3F800000
  v_cndmask_b32  v0, s0, v0, vcc                        // 000000000088: 00000000
  v_add_f16     v192, s0, v0                            // 00000000008C: 3F800000
  v_add_f16     v192, s0, v0                            // 000000000090: 3F800000
  v_cndmask_b32  v0, s0, v0, vcc                        // 000000000094: 00000000
  v_add_f16     v192, s0, v0                            // 000000000098: 3F800000
  v_cndmask_b32  v0, s0, v0, vcc                        // 00000000009C: 00000000
With some bit magic, we can get something much nicer:
  v_add_u32     v0, s2, v0                              // 000000000000: 68000002
  v_add_u32     v1, 1, v0                               // 000000000004: 68020081
  v_and_b32     v1, 2, v1                               // 000000000008: 26020282
  v_cvt_f32_i32  v1, v1                                 // 00000000000C: 7E020B01
  v_mul_f32     v1, 0.5, v1                             // 000000000010: 0A0202F0
  v_and_b32     v0, 2, v0                               // 000000000014: 26000082
  v_cvt_f32_i32  v0, v0                                 // 000000000018: 7E000B00
  v_mul_f32     v0, 0.5, v0                             // 00000000001C: 0A0000F0
  v_mov_b32     v2, 0                                   // 000000000020: 7E040280
  v_mov_b32     v3, 1.0                                 // 000000000024: 7E0602F2
  exp           pos0, v1, v0, v2, v3 done               // 000000000028: C40008CF 03020001
  exp           param0, off, off, off, off              // 000000000030: C4000200 00000000
  s_endpgm                                              // 000000000038: BF810000
The output above comes from simply running the shader through ShaderPlayground -- I was not able to use pipeline feedback, because I could not get RenderDoc working due to the VK_EXT_physical_device_drm requirement.
I also considered using >> 1 instead of * 0.5, but AMD has dedicated output modifiers that fold a * 0.5, * 2.0, etc. into a single instruction. (Admittedly, the code above does not take advantage of this, but ACO might.)
Signed-off-by: Joshua Ashton <joshua@froggi.es>
| -rw-r--r-- | render/vulkan/shaders/common.vert | 11 | 
1 files changed, 2 insertions, 9 deletions
```diff
diff --git a/render/vulkan/shaders/common.vert b/render/vulkan/shaders/common.vert
index fa31d26c..c6175d24 100644
--- a/render/vulkan/shaders/common.vert
+++ b/render/vulkan/shaders/common.vert
@@ -10,16 +10,9 @@ layout(push_constant, row_major) uniform UBO {
 
 layout(location = 0) out vec2 uv;
 
-// 4 outlining points and uv coords
-const vec2[] values = {
-	{0, 0},
-	{1, 0},
-	{1, 1},
-	{0, 1},
-};
-
 void main() {
-	vec2 pos = values[gl_VertexIndex % 4];
+	vec2 pos = vec2(float((gl_VertexIndex + 1) & 2) * 0.5f,
+		float(gl_VertexIndex & 2) * 0.5f);
 	uv = data.uv_offset + pos * data.uv_size;
 	gl_Position = data.proj * vec4(pos, 0.0, 1.0);
 }
```
