@@ -567,68 +567,70 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
567
567
; GFX908-NEXT: s_add_i32 s1, s9, s1
568
568
; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
569
569
; GFX908-NEXT: s_branch .LBB3_2
570
- ; GFX908-NEXT: .LBB3_1: ; %bb12
570
+ ; GFX908-NEXT: .LBB3_1: ; %Flow20
571
571
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
572
- ; GFX908-NEXT: s_add_u32 s6, s6, s4
573
- ; GFX908-NEXT: s_addc_u32 s7, s7, 0
574
- ; GFX908-NEXT: s_add_u32 s10, s10, s12
575
- ; GFX908-NEXT: s_addc_u32 s11, s11, s13
576
- ; GFX908-NEXT: .LBB3_2: ; %bb9
572
+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15]
573
+ ; GFX908-NEXT: s_cbranch_vccz .LBB3_12
574
+ ; GFX908-NEXT: .LBB3_2: ; %bb9
577
575
; GFX908-NEXT: ; =>This Loop Header: Depth=1
578
576
; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
579
- ; GFX908-NEXT: s_cbranch_scc0 .LBB3_1
580
- ; GFX908-NEXT: ; %bb.3: ; %bb14
577
+ ; GFX908-NEXT: s_mov_b64 s[16:17], -1
578
+ ; GFX908-NEXT: s_cbranch_scc0 .LBB3_10
579
+ ; GFX908-NEXT: ; %bb.3: ; %bb14
581
580
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
582
581
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
583
582
; GFX908-NEXT: s_mov_b32 s9, s8
584
583
; GFX908-NEXT: v_mov_b32_e32 v4, s8
585
- ; GFX908-NEXT: v_mov_b32_e32 v6, s8
586
584
; GFX908-NEXT: v_mov_b32_e32 v8, s8
585
+ ; GFX908-NEXT: v_mov_b32_e32 v6, s8
587
586
; GFX908-NEXT: v_mov_b32_e32 v5, s9
588
- ; GFX908-NEXT: v_mov_b32_e32 v7, s9
589
587
; GFX908-NEXT: v_mov_b32_e32 v9, s9
588
+ ; GFX908-NEXT: v_mov_b32_e32 v7, s9
590
589
; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
590
+ ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
591
591
; GFX908-NEXT: v_mov_b32_e32 v11, v5
592
- ; GFX908-NEXT: s_mov_b64 s[16:17 ], s[10:11]
592
+ ; GFX908-NEXT: s_mov_b64 s[20:21 ], s[10:11]
593
593
; GFX908-NEXT: v_mov_b32_e32 v10, v4
594
594
; GFX908-NEXT: s_waitcnt vmcnt(0)
595
595
; GFX908-NEXT: v_readfirstlane_b32 s5, v2
596
596
; GFX908-NEXT: v_readfirstlane_b32 s9, v3
597
597
; GFX908-NEXT: s_add_u32 s5, s5, 1
598
598
; GFX908-NEXT: s_addc_u32 s9, s9, 0
599
599
; GFX908-NEXT: s_mul_hi_u32 s19, s2, s5
600
- ; GFX908-NEXT: s_mul_i32 s20 , s3, s5
600
+ ; GFX908-NEXT: s_mul_i32 s22 , s3, s5
601
601
; GFX908-NEXT: s_mul_i32 s18, s2, s5
602
602
; GFX908-NEXT: s_mul_i32 s5, s2, s9
603
603
; GFX908-NEXT: s_add_i32 s5, s19, s5
604
- ; GFX908-NEXT: s_add_i32 s5, s5, s20
604
+ ; GFX908-NEXT: s_add_i32 s5, s5, s22
605
605
; GFX908-NEXT: s_branch .LBB3_5
606
606
; GFX908-NEXT: .LBB3_4: ; %bb58
607
607
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
608
608
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
609
609
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
610
- ; GFX908-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
611
- ; GFX908-NEXT: s_add_u32 s16, s16, s0
612
- ; GFX908-NEXT: s_addc_u32 s17, s17, s1
613
- ; GFX908-NEXT: s_cbranch_vccz .LBB3_1
610
+ ; GFX908-NEXT: s_add_u32 s20, s20, s0
611
+ ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
612
+ ; GFX908-NEXT: s_addc_u32 s21, s21, s1
613
+ ; GFX908-NEXT: s_mov_b64 s[22:23], 0
614
+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
615
+ ; GFX908-NEXT: s_cbranch_vccz .LBB3_9
614
616
; GFX908-NEXT: .LBB3_5: ; %bb16
615
617
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
616
618
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
617
- ; GFX908-NEXT: s_add_u32 s20, s16 , s18
618
- ; GFX908-NEXT: s_addc_u32 s21, s17 , s5
619
- ; GFX908-NEXT: global_load_dword v21, v19, s[20:21 ] offset:-12 glc
619
+ ; GFX908-NEXT: s_add_u32 s22, s20 , s18
620
+ ; GFX908-NEXT: s_addc_u32 s23, s21 , s5
621
+ ; GFX908-NEXT: global_load_dword v21, v19, s[22:23 ] offset:-12 glc
620
622
; GFX908-NEXT: s_waitcnt vmcnt(0)
621
- ; GFX908-NEXT: global_load_dword v20, v19, s[20:21 ] offset:-8 glc
623
+ ; GFX908-NEXT: global_load_dword v20, v19, s[22:23 ] offset:-8 glc
622
624
; GFX908-NEXT: s_waitcnt vmcnt(0)
623
- ; GFX908-NEXT: global_load_dword v12, v19, s[20:21 ] offset:-4 glc
625
+ ; GFX908-NEXT: global_load_dword v12, v19, s[22:23 ] offset:-4 glc
624
626
; GFX908-NEXT: s_waitcnt vmcnt(0)
625
- ; GFX908-NEXT: global_load_dword v12, v19, s[20:21 ] glc
627
+ ; GFX908-NEXT: global_load_dword v12, v19, s[22:23 ] glc
626
628
; GFX908-NEXT: s_waitcnt vmcnt(0)
627
629
; GFX908-NEXT: ds_read_b64 v[12:13], v19
628
630
; GFX908-NEXT: ds_read_b64 v[14:15], v0
629
- ; GFX908-NEXT: s_and_b64 vcc, exec, s[14:15 ]
631
+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
630
632
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
631
- ; GFX908-NEXT: s_cbranch_vccnz .LBB3_4
633
+ ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
632
634
; GFX908-NEXT: ; %bb.6: ; %bb51
633
635
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
634
636
; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -645,12 +647,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
645
647
; GFX908-NEXT: v_add_f32_e32 v12, v20, v12
646
648
; GFX908-NEXT: v_add_f32_e32 v5, v5, v25
647
649
; GFX908-NEXT: v_add_f32_e32 v4, v4, v24
648
- ; GFX908-NEXT: v_add_f32_e32 v7, v7 , v27
649
- ; GFX908-NEXT: v_add_f32_e32 v6, v6 , v26
650
- ; GFX908-NEXT: v_add_f32_e32 v8, v8 , v14
651
- ; GFX908-NEXT: v_add_f32_e32 v9, v9 , v15
650
+ ; GFX908-NEXT: v_add_f32_e32 v9, v9 , v27
651
+ ; GFX908-NEXT: v_add_f32_e32 v8, v8 , v26
652
+ ; GFX908-NEXT: v_add_f32_e32 v6, v6 , v14
653
+ ; GFX908-NEXT: v_add_f32_e32 v7, v7 , v15
652
654
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
653
655
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
656
+ ; GFX908-NEXT: s_mov_b64 s[22:23], -1
654
657
; GFX908-NEXT: s_branch .LBB3_4
655
658
;
656
659
; GFX90A-LABEL: introduced_copy_to_sgpr:
@@ -700,65 +703,67 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
700
703
; GFX90A-NEXT: s_add_i32 s1, s9, s1
701
704
; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
702
705
; GFX90A-NEXT: s_branch .LBB3_2
703
- ; GFX90A-NEXT: .LBB3_1: ; %bb12
706
+ ; GFX90A-NEXT: .LBB3_1: ; %Flow20
704
707
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
705
- ; GFX90A-NEXT: s_add_u32 s6, s6, s4
706
- ; GFX90A-NEXT: s_addc_u32 s7, s7, 0
707
- ; GFX90A-NEXT: s_add_u32 s10, s10, s12
708
- ; GFX90A-NEXT: s_addc_u32 s11, s11, s13
708
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15]
709
+ ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
709
710
; GFX90A-NEXT: .LBB3_2: ; %bb9
710
711
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
711
712
; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
712
- ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_1
713
+ ; GFX90A-NEXT: s_mov_b64 s[16:17], -1
714
+ ; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10
713
715
; GFX90A-NEXT: ; %bb.3: ; %bb14
714
716
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
715
717
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
716
718
; GFX90A-NEXT: s_mov_b32 s9, s8
717
719
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
718
- ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
719
720
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
721
+ ; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
720
722
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
721
- ; GFX90A-NEXT: s_mov_b64 s[16:17], s[10:11]
723
+ ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
724
+ ; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11]
722
725
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
723
726
; GFX90A-NEXT: s_waitcnt vmcnt(0)
724
727
; GFX90A-NEXT: v_readfirstlane_b32 s5, v4
725
728
; GFX90A-NEXT: v_readfirstlane_b32 s9, v5
726
729
; GFX90A-NEXT: s_add_u32 s5, s5, 1
727
730
; GFX90A-NEXT: s_addc_u32 s9, s9, 0
728
731
; GFX90A-NEXT: s_mul_hi_u32 s19, s2, s5
729
- ; GFX90A-NEXT: s_mul_i32 s20 , s3, s5
732
+ ; GFX90A-NEXT: s_mul_i32 s22 , s3, s5
730
733
; GFX90A-NEXT: s_mul_i32 s18, s2, s5
731
734
; GFX90A-NEXT: s_mul_i32 s5, s2, s9
732
735
; GFX90A-NEXT: s_add_i32 s5, s19, s5
733
- ; GFX90A-NEXT: s_add_i32 s5, s5, s20
736
+ ; GFX90A-NEXT: s_add_i32 s5, s5, s22
734
737
; GFX90A-NEXT: s_branch .LBB3_5
735
738
; GFX90A-NEXT: .LBB3_4: ; %bb58
736
739
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
737
740
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
738
741
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
739
- ; GFX90A-NEXT: s_add_u32 s16, s16, s0
740
- ; GFX90A-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[4:5]
741
- ; GFX90A-NEXT: s_addc_u32 s17, s17, s1
742
- ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
742
+ ; GFX90A-NEXT: s_add_u32 s20, s20, s0
743
+ ; GFX90A-NEXT: s_addc_u32 s21, s21, s1
744
+ ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
745
+ ; GFX90A-NEXT: s_mov_b64 s[22:23], 0
746
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
747
+ ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
743
748
; GFX90A-NEXT: .LBB3_5: ; %bb16
744
749
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
745
750
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
746
- ; GFX90A-NEXT: s_add_u32 s20, s16 , s18
747
- ; GFX90A-NEXT: s_addc_u32 s21, s17 , s5
748
- ; GFX90A-NEXT: global_load_dword v21, v19, s[20:21 ] offset:-12 glc
751
+ ; GFX90A-NEXT: s_add_u32 s22, s20 , s18
752
+ ; GFX90A-NEXT: s_addc_u32 s23, s21 , s5
753
+ ; GFX90A-NEXT: global_load_dword v21, v19, s[22:23 ] offset:-12 glc
749
754
; GFX90A-NEXT: s_waitcnt vmcnt(0)
750
- ; GFX90A-NEXT: global_load_dword v20, v19, s[20:21 ] offset:-8 glc
755
+ ; GFX90A-NEXT: global_load_dword v20, v19, s[22:23 ] offset:-8 glc
751
756
; GFX90A-NEXT: s_waitcnt vmcnt(0)
752
- ; GFX90A-NEXT: global_load_dword v14, v19, s[20:21 ] offset:-4 glc
757
+ ; GFX90A-NEXT: global_load_dword v14, v19, s[22:23 ] offset:-4 glc
753
758
; GFX90A-NEXT: s_waitcnt vmcnt(0)
754
- ; GFX90A-NEXT: global_load_dword v14, v19, s[20:21 ] glc
759
+ ; GFX90A-NEXT: global_load_dword v14, v19, s[22:23 ] glc
755
760
; GFX90A-NEXT: s_waitcnt vmcnt(0)
756
761
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
757
762
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
758
- ; GFX90A-NEXT: s_and_b64 vcc, exec, s[14:15 ]
759
- ; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
763
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
764
+ ; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
760
765
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
761
- ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_4
766
+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
762
767
; GFX90A-NEXT: ; %bb.6: ; %bb51
763
768
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
764
769
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -770,10 +775,40 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
770
775
; GFX90A-NEXT: v_pk_add_f32 v[16:17], v[22:23], v[16:17]
771
776
; GFX90A-NEXT: v_pk_add_f32 v[14:15], v[20:21], v[14:15]
772
777
; GFX90A-NEXT: v_pk_add_f32 v[6:7], v[6:7], v[24:25]
773
- ; GFX90A-NEXT: v_pk_add_f32 v[8:9 ], v[8:9 ], v[26:27]
774
- ; GFX90A-NEXT: v_pk_add_f32 v[10:11 ], v[10:11 ], v[16:17]
778
+ ; GFX90A-NEXT: v_pk_add_f32 v[10:11 ], v[10:11 ], v[26:27]
779
+ ; GFX90A-NEXT: v_pk_add_f32 v[8:9 ], v[8:9 ], v[16:17]
775
780
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
781
+ ; GFX90A-NEXT: s_mov_b64 s[22:23], -1
776
782
; GFX90A-NEXT: s_branch .LBB3_4
783
+ ; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
784
+ ; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15]
785
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
786
+ ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
787
+ ; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
788
+ ; GFX90A-NEXT: ; implicit-def: $vgpr12_vgpr13
789
+ ; GFX90A-NEXT: ; implicit-def: $vgpr8_vgpr9
790
+ ; GFX90A-NEXT: ; implicit-def: $vgpr10_vgpr11
791
+ ; GFX90A-NEXT: ; implicit-def: $vgpr6_vgpr7
792
+ ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
793
+ ; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21
794
+ ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
795
+ ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
796
+ ; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1
797
+ ; GFX90A-NEXT: .LBB3_10: ; %Flow19
798
+ ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
799
+ ; GFX90A-NEXT: s_mov_b64 s[14:15], -1
800
+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
801
+ ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
802
+ ; GFX90A-NEXT: ; %bb.11: ; %bb12
803
+ ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
804
+ ; GFX90A-NEXT: s_add_u32 s6, s6, s4
805
+ ; GFX90A-NEXT: s_addc_u32 s7, s7, 0
806
+ ; GFX90A-NEXT: s_add_u32 s10, s10, s12
807
+ ; GFX90A-NEXT: s_addc_u32 s11, s11, s13
808
+ ; GFX90A-NEXT: s_mov_b64 s[14:15], 0
809
+ ; GFX90A-NEXT: s_branch .LBB3_1
810
+ ; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
811
+ ; GFX90A-NEXT: s_endpgm
777
812
bb:
778
813
%i = load volatile i16 , ptr addrspace (4 ) undef , align 2
779
814
%i6 = zext i16 %i to i64
0 commit comments