Skip to content

Commit 3c5e423

Browse files
committed
AMDGPU: Make some packed shuffles free
VOP3P instructions can encode access to either half of the register. llvm-svn: 302730
1 parent acdc765 commit 3c5e423

File tree

5 files changed

+155
-42
lines changed

5 files changed

+155
-42
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
363363
unsigned Index) {
364364
switch (Opcode) {
365365
case Instruction::ExtractElement:
366-
case Instruction::InsertElement:
366+
case Instruction::InsertElement: {
367+
unsigned EltSize
368+
= DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
369+
if (EltSize < 32) {
370+
if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
371+
return 0;
372+
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
373+
}
374+
367375
// Extracts are just reads of a subregister, so are free. Inserts are
368376
// considered free because we don't want to have any cost for scalarizing
369377
// operations, and we don't have to copy into a different register class.
370378

371379
// Dynamic indexing isn't free and is best avoided.
372380
return Index == ~0u ? 2 : 0;
381+
}
373382
default:
374383
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
375384
}
@@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
479488

480489
return false;
481490
}
491+
492+
// Shuffle cost hook for AMDGPU.
//
// Returns 0 for broadcast, reverse and single-source permute shuffles of a
// two-element vector with 16-bit elements when the subtarget reports VOP3P
// instructions. Every other shuffle kind, vector shape or subtarget defers to
// BaseT::getShuffleCost with the arguments unchanged.
//
// Note: Tp is cast<> to VectorType whenever VOP3P instructions are available,
// so callers are expected to pass a vector type.
unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
493+
Type *SubTp) {
494+
if (ST->hasVOP3PInsts()) {
495+
VectorType *VT = cast<VectorType>(Tp);
496+
if (VT->getNumElements() == 2 &&
497+
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
498+
// With op_sel, VOP3P instructions can freely access the low half or the
499+
// high half of a register, so any swizzle is free.
500+
501+
switch (Kind) {
502+
case TTI::SK_Broadcast:
503+
case TTI::SK_Reverse:
504+
case TTI::SK_PermuteSingleSrc:
505+
return 0;
506+
default:
507+
// Other shuffle kinds fall through to the generic cost below.
508+
break;
509+
}
510+
}
511+
}
512+
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
513+
}

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
114114
}
115115

116116
unsigned getVectorSplitCost() { return 0; }
117+
118+
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
119+
Type *SubTp);
117120
};
118121

119122
} // end namespace llvm
Lines changed: 48 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,45 @@
1-
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
1+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
2+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
3+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
24

3-
; CHECK: 'extractelement_v2i32'
4-
; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
5+
; GCN: 'extractelement_v2i32'
6+
; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
57
define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
68
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
79
%elt = extractelement <2 x i32> %vec, i32 1
810
store i32 %elt, i32 addrspace(1)* %out
911
ret void
1012
}
1113

12-
; CHECK: 'extractelement_v2f32'
13-
; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
14+
; GCN: 'extractelement_v2f32'
15+
; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
1416
define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
1517
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
1618
%elt = extractelement <2 x float> %vec, i32 1
1719
store float %elt, float addrspace(1)* %out
1820
ret void
1921
}
2022

21-
; CHECK: 'extractelement_v3i32'
22-
; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
23+
; GCN: 'extractelement_v3i32'
24+
; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
2325
define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
2426
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
2527
%elt = extractelement <3 x i32> %vec, i32 1
2628
store i32 %elt, i32 addrspace(1)* %out
2729
ret void
2830
}
2931

30-
; CHECK: 'extractelement_v4i32'
31-
; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
32+
; GCN: 'extractelement_v4i32'
33+
; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32>
3234
define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
3335
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
3436
%elt = extractelement <4 x i32> %vec, i32 1
3537
store i32 %elt, i32 addrspace(1)* %out
3638
ret void
3739
}
3840

39-
; CHECK: 'extractelement_v8i32'
40-
; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
41+
; GCN: 'extractelement_v8i32'
42+
; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
4143
define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
4244
%vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
4345
%elt = extractelement <8 x i32> %vec, i32 1
@@ -46,65 +48,85 @@ define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32
4648
}
4749

4850
; Dynamic indexing is not free, so the cost is expected to be non-0.
49-
; CHECK: 'extractelement_v8i32_dynindex'
50-
; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
51+
; GCN: 'extractelement_v8i32_dynindex'
52+
; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32>
5153
define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
5254
%vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
5355
%elt = extractelement <8 x i32> %vec, i32 %idx
5456
store i32 %elt, i32 addrspace(1)* %out
5557
ret void
5658
}
5759

58-
; CHECK: 'extractelement_v2i64'
59-
; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
60+
; GCN: 'extractelement_v2i64'
61+
; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64>
6062
define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
6163
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
6264
%elt = extractelement <2 x i64> %vec, i64 1
6365
store i64 %elt, i64 addrspace(1)* %out
6466
ret void
6567
}
6668

67-
; CHECK: 'extractelement_v3i64'
68-
; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
69+
; GCN: 'extractelement_v3i64'
70+
; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64>
6971
define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
7072
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
7173
%elt = extractelement <3 x i64> %vec, i64 1
7274
store i64 %elt, i64 addrspace(1)* %out
7375
ret void
7476
}
7577

76-
; CHECK: 'extractelement_v4i64'
77-
; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
78+
; GCN: 'extractelement_v4i64'
79+
; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64>
7880
define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
7981
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
8082
%elt = extractelement <4 x i64> %vec, i64 1
8183
store i64 %elt, i64 addrspace(1)* %out
8284
ret void
8385
}
8486

85-
; CHECK: 'extractelement_v8i64'
86-
; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
87+
; GCN: 'extractelement_v8i64'
88+
; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64>
8789
define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
8890
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
8991
%elt = extractelement <8 x i64> %vec, i64 1
9092
store i64 %elt, i64 addrspace(1)* %out
9193
ret void
9294
}
9395

94-
; CHECK: 'extractelement_v4i8'
95-
; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
96+
; GCN: 'extractelement_v4i8'
97+
; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8>
9698
define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
9799
%vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
98100
%elt = extractelement <4 x i8> %vec, i8 1
99101
store i8 %elt, i8 addrspace(1)* %out
100102
ret void
101103
}
102104

103-
; CHECK: 'extractelement_v2i16'
104-
; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
105-
define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
105+
; GCN: 'extractelement_0_v2i16':
106+
; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0
107+
; VI: estimated cost of 0 for {{.*}} extractelement <2 x i16>
108+
; GFX9: estimated cost of 0 for {{.*}} extractelement <2 x i16>
109+
define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
110+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
111+
%elt = extractelement <2 x i16> %vec, i16 0
112+
store i16 %elt, i16 addrspace(1)* %out
113+
ret void
114+
}
115+
116+
; GCN: 'extractelement_1_v2i16':
117+
; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
118+
define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
106119
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
107120
%elt = extractelement <2 x i16> %vec, i16 1
108121
store i16 %elt, i16 addrspace(1)* %out
109122
ret void
110123
}
124+
125+
; GCN: 'extractelement_var_v2i16'
126+
; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
127+
define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) {
128+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
129+
%elt = extractelement <2 x i16> %vec, i32 %idx
130+
store i16 %elt, i16 addrspace(1)* %out
131+
ret void
132+
}
Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,50 @@
1-
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
1+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
2+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
3+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
24

3-
; CHECK: 'insertelement_v2i32'
4-
; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
5+
; GCN-LABEL: 'insertelement_v2i32'
6+
; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32>
57
define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
68
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
7-
%insert = insertelement <2 x i32> %vec, i32 1, i32 123
9+
%insert = insertelement <2 x i32> %vec, i32 123, i32 1
810
store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
911
ret void
1012
}
1113

12-
; CHECK: 'insertelement_v2i64'
13-
; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
14+
; GCN-LABEL: 'insertelement_v2i64'
15+
; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64>
1416
define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
1517
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
16-
%insert = insertelement <2 x i64> %vec, i64 1, i64 123
18+
%insert = insertelement <2 x i64> %vec, i64 123, i64 1
1719
store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
1820
ret void
1921
}
2022

21-
; CHECK: 'insertelement_v2i16'
22-
; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
23-
define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
23+
; GCN-LABEL: 'insertelement_0_v2i16'
24+
; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16>
25+
; VI: estimated cost of 0 for {{.*}} insertelement <2 x i16>
26+
; GFX9: estimated cost of 0 for {{.*}} insertelement <2 x i16>
27+
define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
2428
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
25-
%insert = insertelement <2 x i16> %vec, i16 1, i16 123
29+
%insert = insertelement <2 x i16> %vec, i16 123, i16 0
2630
store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
2731
ret void
2832
}
2933

30-
; CHECK: 'insertelement_v2i8'
31-
; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
32-
define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
34+
; GCN-LABEL: 'insertelement_1_v2i16'
35+
; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16>
36+
define amdgpu_kernel void @insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
37+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
38+
%insert = insertelement <2 x i16> %vec, i16 123, i16 1
39+
store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
40+
ret void
41+
}
42+
43+
; GCN-LABEL: 'insertelement_1_v2i8'
44+
; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8>
45+
define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
3346
%vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
34-
%insert = insertelement <2 x i8> %vec, i8 1, i8 123
47+
%insert = insertelement <2 x i8> %vec, i8 123, i8 1
3548
store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
3649
ret void
3750
}
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
2+
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
3+
4+
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
5+
define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
6+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
7+
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
8+
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
9+
ret void
10+
}
11+
12+
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
13+
define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
14+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
15+
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
16+
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
17+
ret void
18+
}
19+
20+
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
21+
define amdgpu_kernel void @shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
22+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
23+
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
24+
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
25+
ret void
26+
}
27+
28+
; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
29+
define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
30+
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
31+
%shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
32+
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
33+
ret void
34+
}
35+
36+
; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
37+
define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) {
38+
%vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0
39+
%vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr1
40+
%shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
41+
store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
42+
ret void
43+
}

0 commit comments

Comments
 (0)