@@ -108,13 +108,14 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
108
108
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
109
109
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
110
110
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
111
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
111
112
; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
112
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
113
113
; GFX11-NEXT: s_add_i32 s1, s1, s0
114
114
; GFX11-NEXT: s_bitset1_b32 s0, 22
115
115
; GFX11-NEXT: s_addk_i32 s1, 0x7fff
116
116
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
117
117
; GFX11-NEXT: s_cselect_b32 s0, s0, s1
118
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
118
119
; GFX11-NEXT: s_lshr_b32 s0, s0, 16
119
120
; GFX11-NEXT: ; return to shader part epilog
120
121
;
@@ -125,6 +126,7 @@ define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
125
126
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
126
127
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0
127
128
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
129
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
128
130
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
129
131
; GFX12-NEXT: s_or_b32 s2, s0, 0x400000
130
132
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -305,10 +307,11 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
305
307
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, s0
306
308
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
307
309
; GFX11-NEXT: v_cmp_u_f32_e64 s1, v0, v0
308
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
310
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
309
311
; GFX11-NEXT: v_readfirstlane_b32 s0, v1
310
312
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
311
313
; GFX11-NEXT: s_bfe_u32 s3, s0, 0x10010
314
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
312
315
; GFX11-NEXT: s_add_i32 s3, s3, s0
313
316
; GFX11-NEXT: s_bitset1_b32 s0, 22
314
317
; GFX11-NEXT: s_addk_i32 s3, 0x7fff
@@ -338,6 +341,7 @@ define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
338
341
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
339
342
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
340
343
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
344
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
341
345
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
342
346
; GFX12-NEXT: s_or_b32 s3, s0, 0x400000
343
347
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1161,13 +1165,14 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
1161
1165
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
1162
1166
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
1163
1167
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
1168
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1164
1169
; GFX11-NEXT: s_bfe_u32 s1, s0, 0x10010
1165
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1166
1170
; GFX11-NEXT: s_add_i32 s1, s1, s0
1167
1171
; GFX11-NEXT: s_bitset1_b32 s0, 22
1168
1172
; GFX11-NEXT: s_addk_i32 s1, 0x7fff
1169
1173
; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo
1170
1174
; GFX11-NEXT: s_cselect_b32 s0, s0, s1
1175
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1171
1176
; GFX11-NEXT: s_ashr_i32 s0, s0, 16
1172
1177
; GFX11-NEXT: ; return to shader part epilog
1173
1178
;
@@ -1178,6 +1183,7 @@ define amdgpu_ps i32 @s_sitofp_i1_to_bf16(i1 inreg %num) {
1178
1183
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1179
1184
; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0
1180
1185
; GFX12-NEXT: v_readfirstlane_b32 s0, v0
1186
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1181
1187
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
1182
1188
; GFX12-NEXT: s_or_b32 s2, s0, 0x400000
1183
1189
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1358,10 +1364,11 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
1358
1364
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, s0
1359
1365
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
1360
1366
; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
1361
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
1367
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1362
1368
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
1363
1369
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
1364
1370
; GFX11-NEXT: s_bfe_u32 s3, s1, 0x10010
1371
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1365
1372
; GFX11-NEXT: s_add_i32 s3, s3, s1
1366
1373
; GFX11-NEXT: s_bitset1_b32 s1, 22
1367
1374
; GFX11-NEXT: s_addk_i32 s3, 0x7fff
@@ -1391,6 +1398,7 @@ define amdgpu_ps <2 x i32> @s_sitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
1391
1398
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1392
1399
; GFX12-NEXT: v_readfirstlane_b32 s2, v0
1393
1400
; GFX12-NEXT: v_readfirstlane_b32 s0, v1
1401
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1394
1402
; GFX12-NEXT: s_bfe_u32 s1, s0, 0x10010
1395
1403
; GFX12-NEXT: s_or_b32 s3, s0, 0x400000
1396
1404
; GFX12-NEXT: s_wait_alu 0xfffe
0 commit comments