Skip to content

Commit 9d60e95

Browse files
authored
[AMDGPU] Use poison instead of undef for non-demanded elements (llvm#75914)
Return poison instead of undef for non-demanded lanes in the AMDGPU demanded element simplification hook. Also bail out if dmask is 0, as this case has special semantics: "If DMASK==0, the TA overrides DMASK=1 and puts zeros in VGPR followed by LWE status if exists. TFE status is not generated since the fetch is dropped."
1 parent 4c83c27 commit 9d60e95

File tree

3 files changed

+16
-8
lines changed

3 files changed

+16
-8
lines changed

llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp

+6-2
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,10 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
12411241
ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
12421242
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
12431243

1244+
// dmask 0 has special semantics, do not simplify.
1245+
if (DMaskVal == 0)
1246+
return nullptr;
1247+
12441248
// Mask off values that are undefined because the dmask doesn't cover them
12451249
DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
12461250

@@ -1261,7 +1265,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
12611265

12621266
unsigned NewNumElts = DemandedElts.popcount();
12631267
if (!NewNumElts)
1264-
return UndefValue::get(IIVTy);
1268+
return PoisonValue::get(IIVTy);
12651269

12661270
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
12671271
if (DMaskIdx >= 0)
@@ -1299,7 +1303,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
12991303

13001304
if (IsLoad) {
13011305
if (NewNumElts == 1) {
1302-
return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
1306+
return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
13031307
DemandedElts.countr_zero());
13041308
}
13051309

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts-inseltpoison.ll

+5-3
Original file line numberDiff line numberDiff line change
@@ -4792,7 +4792,9 @@ define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %
47924792

47934793
define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
47944794
; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
4795-
; CHECK-NEXT: ret float undef
4795+
; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float [[S:%.*]], float [[T:%.*]], float [[R:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
4796+
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i64 0
4797+
; CHECK-NEXT: ret float [[ELT0]]
47964798
;
47974799
%data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
47984800
%elt0 = extractelement <4 x float> %data, i32 0
@@ -4872,7 +4874,7 @@ define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float
48724874
define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
48734875
; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
48744876
; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
4875-
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <2 x float> <float poison, float undef>, float [[DATA]], i64 0
4877+
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <2 x float> poison, float [[DATA]], i64 0
48764878
; CHECK-NEXT: ret <2 x float> [[SHUF]]
48774879
;
48784880
%data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
@@ -4913,7 +4915,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32
49134915
define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
49144916
; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
49154917
; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
4916-
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <3 x float> <float poison, float undef, float undef>, float [[DATA]], i64 0
4918+
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <3 x float> poison, float [[DATA]], i64 0
49174919
; CHECK-NEXT: ret <3 x float> [[SHUF]]
49184920
;
49194921
%data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)

llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll

+5-3
Original file line numberDiff line numberDiff line change
@@ -4791,7 +4791,9 @@ define amdgpu_ps float @extract_elt0_image_sample_2d_v4f32_f32(float %s, float %
47914791

47924792
define amdgpu_ps float @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(float %s, float %t, float %r, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
47934793
; CHECK-LABEL: @extract_elt0_dmask_0000_image_sample_3d_v4f32_f32(
4794-
; CHECK-NEXT: ret float undef
4794+
; CHECK-NEXT: [[DATA:%.*]] = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float [[S:%.*]], float [[T:%.*]], float [[R:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
4795+
; CHECK-NEXT: [[ELT0:%.*]] = extractelement <4 x float> [[DATA]], i64 0
4796+
; CHECK-NEXT: ret float [[ELT0]]
47954797
;
47964798
%data = call <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32 0, float %s, float %t, float %r, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
47974799
%elt0 = extractelement <4 x float> %data, i32 0
@@ -4871,7 +4873,7 @@ define amdgpu_ps float @extract_elt0_dmask_0111_image_sample_1d_v4f32_f32(float
48714873
define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
48724874
; CHECK-LABEL: @extract_elt0_elt1_dmask_0001_image_sample_1d_v4f32_f32(
48734875
; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
4874-
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <2 x float> <float poison, float undef>, float [[DATA]], i64 0
4876+
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <2 x float> poison, float [[DATA]], i64 0
48754877
; CHECK-NEXT: ret <2 x float> [[SHUF]]
48764878
;
48774879
%data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
@@ -4912,7 +4914,7 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_dmask_0101_image_sample_1d_v4f32
49124914
define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(float %s, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
49134915
; CHECK-LABEL: @extract_elt0_elt1_elt2_dmask_0001_image_sample_1d_v4f32_f32(
49144916
; CHECK-NEXT: [[DATA:%.*]] = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float [[S:%.*]], <8 x i32> [[SAMPLER:%.*]], <4 x i32> [[RSRC:%.*]], i1 false, i32 0, i32 0)
4915-
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <3 x float> <float poison, float undef, float undef>, float [[DATA]], i64 0
4917+
; CHECK-NEXT: [[SHUF:%.*]] = insertelement <3 x float> poison, float [[DATA]], i64 0
49164918
; CHECK-NEXT: ret <3 x float> [[SHUF]]
49174919
;
49184920
%data = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 1, float %s, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)

0 commit comments

Comments
 (0)