@@ -1392,27 +1392,29 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
1392
1392
; SI-NEXT: s_waitcnt lgkmcnt(0)
1393
1393
; SI-NEXT: s_mov_b64 s[12:13], s[4:5]
1394
1394
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[12:15], 0 addr64 offset:3
1395
- ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
1396
- ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1397
1395
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[12:15], 0 addr64 offset:2
1396
+ ; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
1397
+ ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[12:15], 0 addr64 offset:2
1398
1398
; SI-NEXT: s_mov_b32 s10, -1
1399
1399
; SI-NEXT: s_mov_b32 s8, s2
1400
1400
; SI-NEXT: s_mov_b32 s9, s3
1401
1401
; SI-NEXT: s_mov_b32 s2, s10
1402
1402
; SI-NEXT: s_mov_b32 s3, s11
1403
1403
; SI-NEXT: s_waitcnt vmcnt(2)
1404
- ; SI-NEXT: v_lshlrev_b32_e32 v5, 24 , v2
1404
+ ; SI-NEXT: v_lshlrev_b32_e32 v5, 8 , v2
1405
1405
; SI-NEXT: s_waitcnt vmcnt(1)
1406
- ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v3
1407
- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3
1406
+ ; SI-NEXT: v_lshlrev_b32_e32 v6, 8, v4
1407
+ ; SI-NEXT: v_or_b32_e32 v5, v5, v4
1408
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v4
1408
1409
; SI-NEXT: s_waitcnt vmcnt(0)
1409
- ; SI-NEXT: v_or_b32_e32 v6, v4, v6
1410
+ ; SI-NEXT: v_or_b32_e32 v6, v3, v6
1411
+ ; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
1410
1412
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
1411
- ; SI-NEXT: v_alignbit_b32 v5, v3, v5, 24
1412
- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v4
1413
+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
1413
1414
; SI-NEXT: v_mov_b32_e32 v3, v1
1414
- ; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
1415
- ; SI-NEXT: v_or_b32_e32 v4, v5, v4
1415
+ ; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
1416
+ ; SI-NEXT: v_alignbit_b32 v4, v4, v5, 24
1417
+ ; SI-NEXT: v_or_b32_e32 v4, v4, v6
1416
1418
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1417
1419
; SI-NEXT: buffer_store_dword v4, off, s[8:11], 0
1418
1420
; SI-NEXT: s_endpgm
@@ -1572,23 +1574,23 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
1572
1574
; SI-NEXT: s_mov_b32 s7, s3
1573
1575
; SI-NEXT: s_waitcnt vmcnt(0)
1574
1576
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
1577
+ ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
1575
1578
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
1576
1579
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
1577
1580
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
1578
1581
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1579
- ; SI-NEXT: v_add_i32_e32 v7, vcc, 9, v4
1580
- ; SI-NEXT: v_and_b32_e32 v6, 0xff00, v4
1582
+ ; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
1581
1583
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
1582
1584
; SI-NEXT: s_waitcnt expcnt(0)
1583
- ; SI-NEXT: v_and_b32_e32 v0, 0xff, v7
1584
- ; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v5
1585
+ ; SI-NEXT: v_and_b32_e32 v0, 0xff, v4
1586
+ ; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
1587
+ ; SI-NEXT: v_and_b32_e32 v1, 0xff00, v5
1585
1588
; SI-NEXT: v_or_b32_e32 v0, v6, v0
1586
- ; SI-NEXT: v_and_b32_e32 v1, 0xff, v1
1587
- ; SI-NEXT: v_and_b32_e32 v4, 0xff000000, v4
1589
+ ; SI-NEXT: v_and_b32_e32 v2, 0xff, v2
1588
1590
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
1589
- ; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
1591
+ ; SI-NEXT: v_or_b32_e32 v1, v1, v2
1590
1592
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
1591
- ; SI-NEXT: v_or_b32_e32 v1, v4 , v1
1593
+ ; SI-NEXT: v_lshlrev_b32_e32 v1, 16 , v1
1592
1594
; SI-NEXT: v_or_b32_e32 v0, v1, v0
1593
1595
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
1594
1596
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -1600,7 +1602,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
1600
1602
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1601
1603
; VI-NEXT: s_mov_b32 s7, 0xf000
1602
1604
; VI-NEXT: s_mov_b32 s6, -1
1603
- ; VI-NEXT: v_mov_b32_e32 v5, 9
1604
1605
; VI-NEXT: s_waitcnt lgkmcnt(0)
1605
1606
; VI-NEXT: v_mov_b32_e32 v1, s3
1606
1607
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -1613,19 +1614,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
1613
1614
; VI-NEXT: s_mov_b32 s2, s6
1614
1615
; VI-NEXT: s_mov_b32 s3, s7
1615
1616
; VI-NEXT: s_waitcnt vmcnt(0)
1616
- ; VI-NEXT: v_lshrrev_b32_e32 v6, 24 , v4
1617
+ ; VI-NEXT: v_lshrrev_b32_e32 v5, 16 , v4
1617
1618
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
1618
1619
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
1619
1620
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
1620
1621
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1621
- ; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
1622
- ; VI-NEXT: v_add_u16_e32 v8, 9, v4
1623
- ; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1622
+ ; VI-NEXT: v_and_b32_e32 v6, 0xffffff00, v4
1623
+ ; VI-NEXT: v_add_u16_e32 v4, 9, v4
1624
1624
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
1625
1625
; VI-NEXT: s_nop 0
1626
- ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
1627
- ; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1628
- ; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1626
+ ; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v5
1627
+ ; VI-NEXT: v_add_u16_e32 v2, 9, v5
1628
+ ; VI-NEXT: v_or_b32_sdwa v0, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1629
+ ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1629
1630
; VI-NEXT: v_mov_b32_e32 v2, 0x900
1630
1631
; VI-NEXT: v_add_u16_e32 v0, 0x900, v0
1631
1632
; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -1637,18 +1638,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
1637
1638
; GFX10: ; %bb.0:
1638
1639
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1639
1640
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1640
- ; GFX10-NEXT: v_mov_b32_e32 v1, 24
1641
1641
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1642
1642
; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1643
1643
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1644
1644
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1645
1645
; GFX10-NEXT: s_waitcnt vmcnt(0)
1646
- ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0
1647
- ; GFX10-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1646
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1648
1647
; GFX10-NEXT: v_and_b32_e32 v3, 0xffffff00, v0
1649
1648
; GFX10-NEXT: v_add_nc_u16 v4, v0, 9
1650
- ; GFX10-NEXT: v_add_nc_u16 v2, v2, 9
1651
- ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1649
+ ; GFX10-NEXT: v_and_b32_e32 v2, 0xffffff00, v1
1650
+ ; GFX10-NEXT: v_add_nc_u16 v1, v1, 9
1651
+ ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1652
1652
; GFX10-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1653
1653
; GFX10-NEXT: v_mov_b32_e32 v4, 0
1654
1654
; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
@@ -1669,26 +1669,25 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
1669
1669
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34
1670
1670
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1671
1671
; GFX9-NEXT: v_mov_b32_e32 v5, 0
1672
- ; GFX9-NEXT: v_mov_b32_e32 v6, 9
1673
1672
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1674
1673
; GFX9-NEXT: global_load_dword v4, v0, s[0:1]
1675
1674
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
1676
1675
; GFX9-NEXT: s_movk_i32 s4, 0x900
1677
1676
; GFX9-NEXT: s_waitcnt vmcnt(0)
1678
- ; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24 , v4
1677
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16 , v4
1679
1678
; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
1680
1679
; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
1681
1680
; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
1682
1681
; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
1683
- ; GFX9-NEXT: v_and_b32_e32 v8, 0xffffff00, v4
1684
- ; GFX9-NEXT: v_add_u16_e32 v9, 9, v4
1685
- ; GFX9-NEXT: v_add_u16_sdwa v4, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1682
+ ; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
1683
+ ; GFX9-NEXT: v_add_u16_e32 v4, 9, v4
1686
1684
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1687
1685
; GFX9-NEXT: global_store_dwordx4 v5, v[0:3], s[0:1]
1688
1686
; GFX9-NEXT: s_nop 0
1689
- ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v7
1690
- ; GFX9-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1691
- ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1687
+ ; GFX9-NEXT: v_and_b32_e32 v1, 0xffffff00, v6
1688
+ ; GFX9-NEXT: v_add_u16_e32 v2, 9, v6
1689
+ ; GFX9-NEXT: v_or_b32_sdwa v0, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1690
+ ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
1692
1691
; GFX9-NEXT: v_add_u16_e32 v0, 0x900, v0
1693
1692
; GFX9-NEXT: v_add_u16_sdwa v1, v1, s4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1694
1693
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1705,29 +1704,27 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
1705
1704
; GFX11-NEXT: s_waitcnt vmcnt(0)
1706
1705
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
1707
1706
; GFX11-NEXT: v_add_nc_u16 v2, v0, 9
1708
- ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0
1709
1707
; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff00, v0
1710
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4 ) | instskip(NEXT) | instid1(VALU_DEP_4 )
1711
- ; GFX11-NEXT: v_add_nc_u16 v1 , v1, 9
1708
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3 ) | instskip(NEXT) | instid1(VALU_DEP_3 )
1709
+ ; GFX11-NEXT: v_add_nc_u16 v3 , v1, 9
1712
1710
; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2
1713
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
1714
- ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3
1715
- ; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1
1716
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1711
+ ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff00, v1
1712
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1713
+ ; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v3
1717
1714
; GFX11-NEXT: v_or_b32_e32 v2, v4, v2
1718
1715
; GFX11-NEXT: v_mov_b32_e32 v4, 0
1719
- ; GFX11-NEXT: v_or_b32_e32 v1, v3, v1
1720
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1716
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1717
+ ; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
1721
1718
; GFX11-NEXT: v_add_nc_u16 v2, v2, 0x900
1722
1719
; GFX11-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
1720
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
1723
1721
; GFX11-NEXT: v_add_nc_u16 v1, v1, 0x900
1724
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1725
1722
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v2
1726
1723
; GFX11-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
1724
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
1727
1725
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v1
1728
1726
; GFX11-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
1729
1727
; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1730
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
1731
1728
; GFX11-NEXT: v_or_b32_e32 v5, v5, v6
1732
1729
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1733
1730
; GFX11-NEXT: s_clause 0x1
0 commit comments