Skip to content

Commit 04185f0

Browse files
committed
AMDGPU: Fix broken denormal constant folding of canonicalize
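Constant folding of canonicalize needs to consider the dynamic denormal mode: a denormal constant can only be treated as already canonical when the mode is known to be IEEE, and can only be flushed to a signed zero when the mode is known to be preserve-sign; otherwise the canonicalize must be left unfolded. It should be possible to implement a runtime DAZ (denormals-are-zero) check with a canonicalize.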
1 parent 715b127 commit 04185f0

File tree

3 files changed

+138
-9
lines changed

3 files changed

+138
-9
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10856,10 +10856,15 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1085610856
return true;
1085710857

1085810858
if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
10859-
auto F = CFP->getValueAPF();
10859+
const auto &F = CFP->getValueAPF();
1086010860
if (F.isNaN() && F.isSignaling())
1086110861
return false;
10862-
return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
10862+
if (!F.isDenormal())
10863+
return true;
10864+
10865+
DenormalMode Mode =
10866+
DAG.getMachineFunction().getDenormalMode(F.getSemantics());
10867+
return Mode == DenormalMode::getIEEE();
1086310868
}
1086410869

1086510870
// If source is a result of another standard FP operation it is already in
@@ -10928,6 +10933,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1092810933

1092910934
// snans will be quieted, so we only need to worry about denormals.
1093010935
if (Subtarget->supportsMinMaxDenormModes() ||
10936+
// FIXME: denormalsEnabledForType is broken for dynamic
1093110937
denormalsEnabledForType(DAG, Op.getValueType()))
1093210938
return true;
1093310939

@@ -11007,6 +11013,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
1100711013
[[fallthrough]];
1100811014
}
1100911015
default:
11016+
// FIXME: denormalsEnabledForType is broken for dynamic
1101011017
return denormalsEnabledForType(DAG, Op.getValueType()) &&
1101111018
DAG.isKnownNeverSNaN(Op);
1101211019
}
@@ -11028,8 +11035,11 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
1102811035
if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
1102911036
if (FCR->Value.isSignaling())
1103011037
return false;
11031-
return !FCR->Value.isDenormal() ||
11032-
denormalsEnabledForType(MRI.getType(FCR->VReg), MF);
11038+
if (!FCR->Value.isDenormal())
11039+
return true;
11040+
11041+
DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
11042+
return Mode == DenormalMode::getIEEE();
1103311043
}
1103411044

1103511045
if (MaxDepth == 0)
@@ -11072,6 +11082,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
1107211082
case AMDGPU::G_FMINNUM_IEEE:
1107311083
case AMDGPU::G_FMAXNUM_IEEE: {
1107411084
if (Subtarget->supportsMinMaxDenormModes() ||
11085+
// FIXME: denormalsEnabledForType is broken for dynamic
1107511086
denormalsEnabledForType(MRI.getType(Reg), MF))
1107611087
return true;
1107711088

@@ -11128,9 +11139,16 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
1112811139
SDValue SITargetLowering::getCanonicalConstantFP(
1112911140
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
1113011141
// Flush denormals to 0 if not enabled.
11131-
if (C.isDenormal() && !denormalsEnabledForType(DAG, VT)) {
11132-
return DAG.getConstantFP(APFloat::getZero(C.getSemantics(),
11133-
C.isNegative()), SL, VT);
11142+
if (C.isDenormal()) {
11143+
DenormalMode Mode =
11144+
DAG.getMachineFunction().getDenormalMode(C.getSemantics());
11145+
if (Mode == DenormalMode::getPreserveSign()) {
11146+
return DAG.getConstantFP(
11147+
APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
11148+
}
11149+
11150+
if (Mode != DenormalMode::getIEEE())
11151+
return SDValue();
1113411152
}
1113511153

1113611154
if (C.isNaN()) {

llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ body: |
3636
$vgpr0 = COPY %1(s32)
3737
...
3838

39+
# FIXME: Mode fields are redundant and not considered.
3940
---
4041
name: test_denormal_fconstant
4142
tracksRegLiveness: true
@@ -49,8 +50,7 @@ body: |
4950
5051
; CHECK-LABEL: name: test_denormal_fconstant
5152
; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.618950e-319
52-
; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[C]]
53-
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
53+
; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[C]](s64)
5454
%0:_(s64) = G_FCONSTANT double 0x0000000000008000
5555
%1:_(s64) = G_FCANONICALIZE %0
5656
$vgpr0_vgpr1 = COPY %1(s64)

llvm/test/CodeGen/AMDGPU/fcanonicalize.ll

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,114 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
465465
ret void
466466
}
467467

468+
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 {
469+
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
470+
; GFX678: ; %bb.0:
471+
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
472+
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
473+
; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
474+
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
475+
; GFX678-NEXT: v_mov_b32_e32 v0, s0
476+
; GFX678-NEXT: v_mov_b32_e32 v1, s1
477+
; GFX678-NEXT: flat_store_dword v[0:1], v2
478+
; GFX678-NEXT: s_endpgm
479+
;
480+
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
481+
; GFX9: ; %bb.0:
482+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
483+
; GFX9-NEXT: s_mov_b32 s2, 0x7fffff
484+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
485+
; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
486+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
487+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
488+
; GFX9-NEXT: s_endpgm
489+
;
490+
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
491+
; GFX11: ; %bb.0:
492+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
493+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
494+
; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff
495+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
496+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
497+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
498+
; GFX11-NEXT: s_endpgm
499+
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
500+
store float %canonicalized, ptr addrspace(1) %out
501+
ret void
502+
}
503+
504+
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 {
505+
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
506+
; GFX678: ; %bb.0:
507+
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
508+
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
509+
; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
510+
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
511+
; GFX678-NEXT: v_mov_b32_e32 v0, s0
512+
; GFX678-NEXT: v_mov_b32_e32 v1, s1
513+
; GFX678-NEXT: flat_store_dword v[0:1], v2
514+
; GFX678-NEXT: s_endpgm
515+
;
516+
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
517+
; GFX9: ; %bb.0:
518+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
519+
; GFX9-NEXT: s_mov_b32 s2, 0x7fffff
520+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
521+
; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
522+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
523+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
524+
; GFX9-NEXT: s_endpgm
525+
;
526+
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
527+
; GFX11: ; %bb.0:
528+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
529+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
530+
; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff
531+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
532+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
533+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
534+
; GFX11-NEXT: s_endpgm
535+
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
536+
store float %canonicalized, ptr addrspace(1) %out
537+
ret void
538+
}
539+
540+
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 {
541+
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
542+
; GFX678: ; %bb.0:
543+
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
544+
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
545+
; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
546+
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
547+
; GFX678-NEXT: v_mov_b32_e32 v0, s0
548+
; GFX678-NEXT: v_mov_b32_e32 v1, s1
549+
; GFX678-NEXT: flat_store_dword v[0:1], v2
550+
; GFX678-NEXT: s_endpgm
551+
;
552+
; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
553+
; GFX9: ; %bb.0:
554+
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
555+
; GFX9-NEXT: s_mov_b32 s2, 0x7fffff
556+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
557+
; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
558+
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
559+
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
560+
; GFX9-NEXT: s_endpgm
561+
;
562+
; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
563+
; GFX11: ; %bb.0:
564+
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
565+
; GFX11-NEXT: v_mov_b32_e32 v0, 0
566+
; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff
567+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
568+
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
569+
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
570+
; GFX11-NEXT: s_endpgm
571+
%canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
572+
store float %canonicalized, ptr addrspace(1) %out
573+
ret void
574+
}
575+
468576
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 {
469577
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
470578
; GFX678: ; %bb.0:
@@ -2400,3 +2508,6 @@ attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign"
24002508
attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
24012509
attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" }
24022510
attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
2511+
attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" }
2512+
attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" }
2513+
attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" }

0 commit comments

Comments
 (0)