Skip to content

Commit 0a3d755

Browse files
committed
[AMDGPU] Enable divergence-driven BFE selection
Detailed description: This change enables selection of bit field extract patterns to either s_bfe_u32 or v_bfe_u32, depending on the divergence of the pattern's root node. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D110950
1 parent 91f0a6a commit 0a3d755

File tree

6 files changed

+119
-39
lines changed

6 files changed

+119
-39
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -641,8 +641,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
641641
uint32_t OffsetVal = Offset->getZExtValue();
642642
uint32_t WidthVal = Width->getZExtValue();
643643

644-
ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
645-
SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
644+
ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
645+
WidthVal));
646646
return;
647647
}
648648
case AMDGPUISD::DIV_SCALE: {
@@ -1947,9 +1947,17 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
19471947
return true;
19481948
}
19491949

1950-
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
1950+
SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
19511951
SDValue Val, uint32_t Offset,
19521952
uint32_t Width) {
1953+
if (Val->isDivergent()) {
1954+
unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
1955+
SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
1956+
SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
1957+
1958+
return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
1959+
}
1960+
unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
19531961
// Transformation function, pack the offset and width of a BFE into
19541962
// the format expected by the S_BFE_I32 / S_BFE_U32. In the second
19551963
// source, bits [5:0] contain the offset and bits [22:16] the width.
@@ -1974,10 +1982,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
19741982

19751983
if (0 < BVal && BVal <= CVal && CVal < 32) {
19761984
bool Signed = N->getOpcode() == ISD::SRA;
1977-
unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1978-
1979-
ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
1980-
32 - CVal));
1985+
ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
1986+
32 - CVal));
19811987
return;
19821988
}
19831989
}
@@ -2000,9 +2006,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
20002006

20012007
if (isMask_32(MaskVal)) {
20022008
uint32_t WidthVal = countPopulation(MaskVal);
2003-
2004-
ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
2005-
Srl.getOperand(0), ShiftVal, WidthVal));
2009+
ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2010+
WidthVal));
20062011
return;
20072012
}
20082013
}
@@ -2022,9 +2027,8 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
20222027

20232028
if (isMask_32(MaskVal)) {
20242029
uint32_t WidthVal = countPopulation(MaskVal);
2025-
2026-
ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
2027-
And.getOperand(0), ShiftVal, WidthVal));
2030+
ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2031+
WidthVal));
20282032
return;
20292033
}
20302034
}
@@ -2051,7 +2055,7 @@ void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
20512055
break;
20522056

20532057
unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2054-
ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
2058+
ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
20552059
Amt->getZExtValue(), Width));
20562060
return;
20572061
}

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
233233
void SelectMAD_64_32(SDNode *N);
234234
void SelectFMA_W_CHAIN(SDNode *N);
235235
void SelectFMUL_W_CHAIN(SDNode *N);
236-
237-
SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
238-
uint32_t Offset, uint32_t Width);
236+
SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset,
237+
uint32_t Width);
239238
void SelectS_BFEFromShifts(SDNode *N);
240239
void SelectS_BFE(SDNode *N);
241240
bool isCBranchSCC(const SDNode *N) const;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1871,40 +1871,92 @@ def : GCNPat <
18711871
// Conversion Patterns
18721872
//===----------------------------------------------------------------------===//
18731873

1874-
def : GCNPat<(i32 (sext_inreg i32:$src, i1)),
1874+
class UniformSextInreg<ValueType VT> : PatFrag<
1875+
(ops node:$src),
1876+
(sext_inreg $src, VT),
1877+
[{ return !N->isDivergent(); }]>;
1878+
1879+
def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
18751880
(S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
18761881

18771882
// Handle sext_inreg in i64
18781883
def : GCNPat <
1879-
(i64 (sext_inreg i64:$src, i1)),
1884+
(i64 (UniformSextInreg<i1> i64:$src)),
18801885
(S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
18811886
>;
18821887

18831888
def : GCNPat <
1884-
(i16 (sext_inreg i16:$src, i1)),
1889+
(i16 (UniformSextInreg<i1> i16:$src)),
18851890
(S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16
18861891
>;
18871892

18881893
def : GCNPat <
1889-
(i16 (sext_inreg i16:$src, i8)),
1894+
(i16 (UniformSextInreg<i8> i16:$src)),
18901895
(S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
18911896
>;
18921897

18931898
def : GCNPat <
1894-
(i64 (sext_inreg i64:$src, i8)),
1899+
(i64 (UniformSextInreg<i8> i64:$src)),
18951900
(S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
18961901
>;
18971902

18981903
def : GCNPat <
1899-
(i64 (sext_inreg i64:$src, i16)),
1904+
(i64 (UniformSextInreg<i16> i64:$src)),
19001905
(S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
19011906
>;
19021907

19031908
def : GCNPat <
1904-
(i64 (sext_inreg i64:$src, i32)),
1909+
(i64 (UniformSextInreg<i32> i64:$src)),
19051910
(S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
19061911
>;
19071912

1913+
1914+
class DivergentSextInreg<ValueType VT> : PatFrag<
1915+
(ops node:$src),
1916+
(sext_inreg $src, VT),
1917+
[{ return N->isDivergent(); }]>;
1918+
1919+
def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
1920+
(V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
1921+
1922+
def : GCNPat <
1923+
(i16 (DivergentSextInreg<i1> i16:$src)),
1924+
(V_BFE_I32_e64 $src, (i32 0), (i32 1)) // offset 0, width 1
1925+
>;
1926+
1927+
def : GCNPat <
1928+
(i16 (DivergentSextInreg<i8> i16:$src)),
1929+
(V_BFE_I32_e64 $src, (i32 0), (i32 8)) // offset 0, width 8
1930+
>;
1931+
1932+
def : GCNPat <
1933+
(i64 (DivergentSextInreg<i1> i64:$src)),
1934+
(REG_SEQUENCE VReg_64,
1935+
(V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0,
1936+
(V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1)
1937+
>;
1938+
1939+
def : GCNPat <
1940+
(i64 (DivergentSextInreg<i8> i64:$src)),
1941+
(REG_SEQUENCE VReg_64,
1942+
(V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* offset 0, width 8 */), sub0,
1943+
(V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1)
1944+
>;
1945+
1946+
def : GCNPat <
1947+
(i64 (DivergentSextInreg<i16> i64:$src)),
1948+
(REG_SEQUENCE VReg_64,
1949+
(V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* offset 0, width 16 */), sub0,
1950+
(V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1)
1951+
>;
1952+
1953+
def : GCNPat <
1954+
(i64 (DivergentSextInreg<i32> i64:$src)),
1955+
(REG_SEQUENCE VReg_64,
1956+
(i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0,
1957+
(V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1)
1958+
>;
1959+
19081960
def : GCNPat <
19091961
(i64 (zext i32:$src)),
19101962
(REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2+
3+
; GCN_LABEL: @bfe_uniform
4+
; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010
5+
define amdgpu_kernel void @bfe_uniform(i32 %val, i32 addrspace(1)* %out) {
6+
%hibits = lshr i32 %val, 16
7+
%masked = and i32 %hibits, 15
8+
store i32 %masked, i32 addrspace(1)* %out
9+
ret void
10+
}
11+
12+
; GCN_LABEL: @bfe_divergent
13+
; GCN: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 4
14+
define amdgpu_kernel void @bfe_divergent(i32 %val, i32 addrspace(1)* %out) {
15+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
16+
%divergent = add i32 %val, %tid
17+
%hibits = lshr i32 %divergent, 16
18+
%masked = and i32 %hibits, 15
19+
store i32 %masked, i32 addrspace(1)* %out
20+
ret void
21+
}
22+
23+
24+
declare i32 @llvm.amdgcn.workitem.id.x()
25+

llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,14 +79,14 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
7979
; SI-NEXT: s_waitcnt lgkmcnt(0)
8080
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
8181
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
82-
; SI-NEXT: s_movk_i32 s7, 0xfc01
82+
; SI-NEXT: s_movk_i32 s6, 0xfc01
8383
; SI-NEXT: s_mov_b32 s0, -1
8484
; SI-NEXT: s_mov_b32 s1, 0xfffff
85-
; SI-NEXT: s_brev_b32 s6, -2
85+
; SI-NEXT: s_brev_b32 s7, -2
8686
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
8787
; SI-NEXT: s_waitcnt vmcnt(0)
8888
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
89-
; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4
89+
; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4
9090
; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6
9191
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
9292
; SI-NEXT: v_not_b32_e32 v4, v4
@@ -100,7 +100,7 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
100100
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
101101
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
102102
; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
103-
; SI-NEXT: v_bfi_b32 v2, s6, v8, v3
103+
; SI-NEXT: v_bfi_b32 v2, s7, v8, v3
104104
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
105105
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
106106
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc

llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
6262
; CHECK-NEXT: v_bfe_i32 v3, v2, 0, 31
6363
; CHECK-NEXT: v_bfe_i32 v4, v1, 0, 31
6464
; CHECK-NEXT: v_bfe_i32 v5, v0, 0, 31
65-
; CHECK-NEXT: s_mov_b32 s6, 0x38e38e39
66-
; CHECK-NEXT: s_mov_b32 s7, 0xc71c71c7
67-
; CHECK-NEXT: s_brev_b32 s4, -2
68-
; CHECK-NEXT: s_mov_b32 s5, 0x7ffffffd
69-
; CHECK-NEXT: v_mul_hi_i32 v5, v5, s6
70-
; CHECK-NEXT: v_mul_hi_i32 v4, v4, s6
71-
; CHECK-NEXT: v_mul_hi_i32 v3, v3, s7
65+
; CHECK-NEXT: s_mov_b32 s4, 0x38e38e39
66+
; CHECK-NEXT: s_mov_b32 s5, 0xc71c71c7
67+
; CHECK-NEXT: s_brev_b32 s6, -2
68+
; CHECK-NEXT: s_mov_b32 s7, 0x7ffffffd
69+
; CHECK-NEXT: v_mul_hi_i32 v5, v5, s4
70+
; CHECK-NEXT: v_mul_hi_i32 v4, v4, s4
71+
; CHECK-NEXT: v_mul_hi_i32 v3, v3, s5
7272
; CHECK-NEXT: v_lshrrev_b32_e32 v6, 31, v5
7373
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 1, v5
7474
; CHECK-NEXT: v_lshrrev_b32_e32 v7, 31, v4
@@ -84,12 +84,12 @@ define <3 x i1> @test_srem_vec(<3 x i31> %X) nounwind {
8484
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
8585
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
8686
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
87-
; CHECK-NEXT: v_and_b32_e32 v2, s4, v2
88-
; CHECK-NEXT: v_and_b32_e32 v1, s4, v1
89-
; CHECK-NEXT: v_and_b32_e32 v0, s4, v0
87+
; CHECK-NEXT: v_and_b32_e32 v2, s6, v2
88+
; CHECK-NEXT: v_and_b32_e32 v1, s6, v1
89+
; CHECK-NEXT: v_and_b32_e32 v0, s6, v0
9090
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
9191
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
92-
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
92+
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
9393
; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
9494
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2
9595
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc

0 commit comments

Comments
 (0)