Skip to content

Commit 4bbcbda

Browse files
committed
[AMDGPU] Unify divergent nodes if the PostDom tree has one root
This patch allows the AMDGPUUnifyDivergentExitNodes pass to transform a function whose post-dominator tree (PDT) has exactly one root, provided that root block ends in a branch instruction. Fixes #58861. Reviewed By: ruiling, arsenm Differential Revision: https://reviews.llvm.org/D139780
1 parent e9c8242 commit 4bbcbda

13 files changed

+397
-193
lines changed

llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,9 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
187187
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
188188

189189
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
190-
191-
// If there's only one exit, we don't need to do anything.
192-
if (PDT.root_size() <= 1)
190+
if (PDT.root_size() == 0 ||
191+
(PDT.root_size() == 1 &&
192+
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
193193
return false;
194194

195195
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 89 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -567,68 +567,70 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
567567
; GFX908-NEXT: s_add_i32 s1, s9, s1
568568
; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
569569
; GFX908-NEXT: s_branch .LBB3_2
570-
; GFX908-NEXT: .LBB3_1: ; %bb12
570+
; GFX908-NEXT: .LBB3_1: ; %Flow20
571571
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
572-
; GFX908-NEXT: s_add_u32 s6, s6, s4
573-
; GFX908-NEXT: s_addc_u32 s7, s7, 0
574-
; GFX908-NEXT: s_add_u32 s10, s10, s12
575-
; GFX908-NEXT: s_addc_u32 s11, s11, s13
576-
; GFX908-NEXT: .LBB3_2: ; %bb9
572+
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15]
573+
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
574+
; GFX908-NEXT: .LBB3_2: ; %bb9
577575
; GFX908-NEXT: ; =>This Loop Header: Depth=1
578576
; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
579-
; GFX908-NEXT: s_cbranch_scc0 .LBB3_1
580-
; GFX908-NEXT: ; %bb.3: ; %bb14
577+
; GFX908-NEXT: s_mov_b64 s[16:17], -1
578+
; GFX908-NEXT: s_cbranch_scc0 .LBB3_10
579+
; GFX908-NEXT: ; %bb.3: ; %bb14
581580
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
582581
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
583582
; GFX908-NEXT: s_mov_b32 s9, s8
584583
; GFX908-NEXT: v_mov_b32_e32 v4, s8
585-
; GFX908-NEXT: v_mov_b32_e32 v6, s8
586584
; GFX908-NEXT: v_mov_b32_e32 v8, s8
585+
; GFX908-NEXT: v_mov_b32_e32 v6, s8
587586
; GFX908-NEXT: v_mov_b32_e32 v5, s9
588-
; GFX908-NEXT: v_mov_b32_e32 v7, s9
589587
; GFX908-NEXT: v_mov_b32_e32 v9, s9
588+
; GFX908-NEXT: v_mov_b32_e32 v7, s9
590589
; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
590+
; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
591591
; GFX908-NEXT: v_mov_b32_e32 v11, v5
592-
; GFX908-NEXT: s_mov_b64 s[16:17], s[10:11]
592+
; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11]
593593
; GFX908-NEXT: v_mov_b32_e32 v10, v4
594594
; GFX908-NEXT: s_waitcnt vmcnt(0)
595595
; GFX908-NEXT: v_readfirstlane_b32 s5, v2
596596
; GFX908-NEXT: v_readfirstlane_b32 s9, v3
597597
; GFX908-NEXT: s_add_u32 s5, s5, 1
598598
; GFX908-NEXT: s_addc_u32 s9, s9, 0
599599
; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5
600-
; GFX908-NEXT: s_mul_i32 s20, s3, s5
600+
; GFX908-NEXT: s_mul_i32 s22, s3, s5
601601
; GFX908-NEXT: s_mul_i32 s18, s2, s5
602602
; GFX908-NEXT: s_mul_i32 s5, s2, s9
603603
; GFX908-NEXT: s_add_i32 s5, s19, s5
604-
; GFX908-NEXT: s_add_i32 s5, s5, s20
604+
; GFX908-NEXT: s_add_i32 s5, s5, s22
605605
; GFX908-NEXT: s_branch .LBB3_5
606606
; GFX908-NEXT: .LBB3_4: ; %bb58
607607
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
608608
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
609609
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
610-
; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
611-
; GFX908-NEXT: s_add_u32 s16, s16, s0
612-
; GFX908-NEXT: s_addc_u32 s17, s17, s1
613-
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
610+
; GFX908-NEXT: s_add_u32 s20, s20, s0
611+
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
612+
; GFX908-NEXT: s_addc_u32 s21, s21, s1
613+
; GFX908-NEXT: s_mov_b64 s[22:23], 0
614+
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
615+
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
614616
; GFX908-NEXT: .LBB3_5: ; %bb16
615617
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
616618
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
617-
; GFX908-NEXT: s_add_u32 s20, s16, s18
618-
; GFX908-NEXT: s_addc_u32 s21, s17, s5
619-
; GFX908-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc
619+
; GFX908-NEXT: s_add_u32 s22, s20, s18
620+
; GFX908-NEXT: s_addc_u32 s23, s21, s5
621+
; GFX908-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
620622
; GFX908-NEXT: s_waitcnt vmcnt(0)
621-
; GFX908-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc
623+
; GFX908-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
622624
; GFX908-NEXT: s_waitcnt vmcnt(0)
623-
; GFX908-NEXT: global_load_dword v12, v19, s[20:21] offset:-4 glc
625+
; GFX908-NEXT: global_load_dword v12, v19, s[22:23] offset:-4 glc
624626
; GFX908-NEXT: s_waitcnt vmcnt(0)
625-
; GFX908-NEXT: global_load_dword v12, v19, s[20:21] glc
627+
; GFX908-NEXT: global_load_dword v12, v19, s[22:23] glc
626628
; GFX908-NEXT: s_waitcnt vmcnt(0)
627629
; GFX908-NEXT: ds_read_b64 v[12:13], v19
628630
; GFX908-NEXT: ds_read_b64 v[14:15], v0
629-
; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15]
631+
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17]
630632
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
631-
; GFX908-NEXT: s_cbranch_vccnz .LBB3_4
633+
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
632634
; GFX908-NEXT: ; %bb.6: ; %bb51
633635
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
634636
; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -645,12 +647,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
645647
; GFX908-NEXT: v_add_f32_e32 v12, v20, v12
646648
; GFX908-NEXT: v_add_f32_e32 v5, v5, v25
647649
; GFX908-NEXT: v_add_f32_e32 v4, v4, v24
648-
; GFX908-NEXT: v_add_f32_e32 v7, v7, v27
649-
; GFX908-NEXT: v_add_f32_e32 v6, v6, v26
650-
; GFX908-NEXT: v_add_f32_e32 v8, v8, v14
651-
; GFX908-NEXT: v_add_f32_e32 v9, v9, v15
650+
; GFX908-NEXT: v_add_f32_e32 v9, v9, v27
651+
; GFX908-NEXT: v_add_f32_e32 v8, v8, v26
652+
; GFX908-NEXT: v_add_f32_e32 v6, v6, v14
653+
; GFX908-NEXT: v_add_f32_e32 v7, v7, v15
652654
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
653655
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
656+
; GFX908-NEXT: s_mov_b64 s[22:23], -1
654657
; GFX908-NEXT: s_branch .LBB3_4
655658
;
656659
; GFX90A-LABEL: introduced_copy_to_sgpr:
@@ -700,65 +703,67 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
700703
; GFX90A-NEXT: s_add_i32 s1, s9, s1
701704
; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
702705
; GFX90A-NEXT: s_branch .LBB3_2
703-
; GFX90A-NEXT: .LBB3_1: ; %bb12
706+
; GFX90A-NEXT: .LBB3_1: ; %Flow20
704707
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
705-
; GFX90A-NEXT: s_add_u32 s6, s6, s4
706-
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
707-
; GFX90A-NEXT: s_add_u32 s10, s10, s12
708-
; GFX90A-NEXT: s_addc_u32 s11, s11, s13
708+
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15]
709+
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
709710
; GFX90A-NEXT: .LBB3_2: ; %bb9
710711
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
711712
; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
712-
; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1
713+
; GFX90A-NEXT: s_mov_b64 s[16:17], -1
714+
; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10
713715
; GFX90A-NEXT: ; %bb.3: ; %bb14
714716
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
715717
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
716718
; GFX90A-NEXT: s_mov_b32 s9, s8
717719
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
718-
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
719720
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
721+
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
720722
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
721-
; GFX90A-NEXT: s_mov_b64 s[16:17], s[10:11]
723+
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
724+
; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11]
722725
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
723726
; GFX90A-NEXT: s_waitcnt vmcnt(0)
724727
; GFX90A-NEXT: v_readfirstlane_b32 s5, v4
725728
; GFX90A-NEXT: v_readfirstlane_b32 s9, v5
726729
; GFX90A-NEXT: s_add_u32 s5, s5, 1
727730
; GFX90A-NEXT: s_addc_u32 s9, s9, 0
728731
; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5
729-
; GFX90A-NEXT: s_mul_i32 s20, s3, s5
732+
; GFX90A-NEXT: s_mul_i32 s22, s3, s5
730733
; GFX90A-NEXT: s_mul_i32 s18, s2, s5
731734
; GFX90A-NEXT: s_mul_i32 s5, s2, s9
732735
; GFX90A-NEXT: s_add_i32 s5, s19, s5
733-
; GFX90A-NEXT: s_add_i32 s5, s5, s20
736+
; GFX90A-NEXT: s_add_i32 s5, s5, s22
734737
; GFX90A-NEXT: s_branch .LBB3_5
735738
; GFX90A-NEXT: .LBB3_4: ; %bb58
736739
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
737740
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
738741
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
739-
; GFX90A-NEXT: s_add_u32 s16, s16, s0
740-
; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
741-
; GFX90A-NEXT: s_addc_u32 s17, s17, s1
742-
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
742+
; GFX90A-NEXT: s_add_u32 s20, s20, s0
743+
; GFX90A-NEXT: s_addc_u32 s21, s21, s1
744+
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
745+
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
746+
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
747+
; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
743748
; GFX90A-NEXT: .LBB3_5: ; %bb16
744749
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
745750
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
746-
; GFX90A-NEXT: s_add_u32 s20, s16, s18
747-
; GFX90A-NEXT: s_addc_u32 s21, s17, s5
748-
; GFX90A-NEXT: global_load_dword v21, v19, s[20:21] offset:-12 glc
751+
; GFX90A-NEXT: s_add_u32 s22, s20, s18
752+
; GFX90A-NEXT: s_addc_u32 s23, s21, s5
753+
; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc
749754
; GFX90A-NEXT: s_waitcnt vmcnt(0)
750-
; GFX90A-NEXT: global_load_dword v20, v19, s[20:21] offset:-8 glc
755+
; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc
751756
; GFX90A-NEXT: s_waitcnt vmcnt(0)
752-
; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] offset:-4 glc
757+
; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc
753758
; GFX90A-NEXT: s_waitcnt vmcnt(0)
754-
; GFX90A-NEXT: global_load_dword v14, v19, s[20:21] glc
759+
; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc
755760
; GFX90A-NEXT: s_waitcnt vmcnt(0)
756761
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
757762
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
758-
; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15]
759-
; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
763+
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17]
764+
; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
760765
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
761-
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4
766+
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
762767
; GFX90A-NEXT: ; %bb.6: ; %bb51
763768
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
764769
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -770,10 +775,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
770775
; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17]
771776
; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15]
772777
; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25]
773-
; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[26:27]
774-
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
778+
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[26:27]
779+
; GFX90A-NEXT: v_pk_add_f32 v[8:9], v[8:9], v[16:17]
775780
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
781+
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
776782
; GFX90A-NEXT: s_branch .LBB3_4
783+
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
784+
; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15]
785+
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
786+
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
787+
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
788+
; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13
789+
; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
790+
; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11
791+
; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
792+
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
793+
; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21
794+
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
795+
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
796+
; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1
797+
; GFX90A-NEXT: .LBB3_10: ; %Flow19
798+
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
799+
; GFX90A-NEXT: s_mov_b64 s[14:15], -1
800+
; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
801+
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
802+
; GFX90A-NEXT: ; %bb.11: ; %bb12
803+
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
804+
; GFX90A-NEXT: s_add_u32 s6, s6, s4
805+
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
806+
; GFX90A-NEXT: s_add_u32 s10, s10, s12
807+
; GFX90A-NEXT: s_addc_u32 s11, s11, s13
808+
; GFX90A-NEXT: s_mov_b64 s[14:15], 0
809+
; GFX90A-NEXT: s_branch .LBB3_1
810+
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
811+
; GFX90A-NEXT: s_endpgm
777812
bb:
778813
%i = load volatile i16, ptr addrspace(4) undef, align 2
779814
%i6 = zext i16 %i to i64

0 commit comments

Comments
 (0)