Skip to content

Commit dc89a3e

Browse files
committed
HIP: Fix handling of denormal mode
I didn't realize HIP was a distinct offloading kind, so the subtarget was looking for -march, which isn't correct for HIP. We also have the possibility of different denormal defaults in the case of multiple offload targets, so we need to thread the JobAction through the target hook.
1 parent 0d4ec16 commit dc89a3e

File tree

10 files changed

+53
-32
lines changed

10 files changed

+53
-32
lines changed

clang/include/clang/Driver/ToolChain.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -636,8 +636,7 @@ class ToolChain {
636636
/// environment for the given \p FPType if given. Otherwise, the default
637637
/// assumed mode for any floating point type.
638638
virtual llvm::DenormalMode getDefaultDenormalModeForType(
639-
const llvm::opt::ArgList &DriverArgs,
640-
Action::OffloadKind DeviceOffloadKind,
639+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
641640
const llvm::fltSemantics *FPType = nullptr) const {
642641
return llvm::DenormalMode::getIEEE();
643642
}

clang/lib/Driver/ToolChains/AMDGPU.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -273,18 +273,22 @@ bool AMDGPUToolChain::getDefaultDenormsAreZeroForTarget(
273273
}
274274

275275
llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
276-
const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
276+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
277277
const llvm::fltSemantics *FPType) const {
278278
// Denormals should always be enabled for f16 and f64.
279279
if (!FPType || FPType != &llvm::APFloat::IEEEsingle())
280280
return llvm::DenormalMode::getIEEE();
281281

282-
if (DeviceOffloadKind == Action::OFK_Cuda) {
282+
if (JA.getOffloadingDeviceKind() == Action::OFK_HIP ||
283+
JA.getOffloadingDeviceKind() == Action::OFK_Cuda) {
284+
auto Kind = llvm::AMDGPU::parseArchAMDGCN(JA.getOffloadingArch());
283285
if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
284286
DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
285287
options::OPT_fno_cuda_flush_denormals_to_zero,
286-
false))
288+
getDefaultDenormsAreZeroForTarget(Kind)))
287289
return llvm::DenormalMode::getPreserveSign();
290+
291+
return llvm::DenormalMode::getIEEE();
288292
}
289293

290294
const StringRef GpuArch = DriverArgs.getLastArgValue(options::OPT_mcpu_EQ);
@@ -294,7 +298,9 @@ llvm::DenormalMode AMDGPUToolChain::getDefaultDenormalModeForType(
294298
// them all?
295299
bool DAZ = DriverArgs.hasArg(options::OPT_cl_denorms_are_zero) ||
296300
getDefaultDenormsAreZeroForTarget(Kind);
297-
// Outputs are flushed to zero, preserving sign
301+
302+
// Outputs are flushed to zero (FTZ), preserving sign. Denormal inputs are
303+
// also implicitly treated as zero (DAZ).
298304
return DAZ ? llvm::DenormalMode::getPreserveSign() :
299305
llvm::DenormalMode::getIEEE();
300306
}

clang/lib/Driver/ToolChains/AMDGPU.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,8 +214,7 @@ class LLVM_LIBRARY_VISIBILITY AMDGPUToolChain : public Generic_ELF {
214214
static bool getDefaultDenormsAreZeroForTarget(llvm::AMDGPU::GPUKind GPUKind);
215215

216216
llvm::DenormalMode getDefaultDenormalModeForType(
217-
const llvm::opt::ArgList &DriverArgs,
218-
Action::OffloadKind DeviceOffloadKind,
217+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
219218
const llvm::fltSemantics *FPType = nullptr) const override;
220219
};
221220

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2510,7 +2510,7 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
25102510
static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
25112511
bool OFastEnabled, const ArgList &Args,
25122512
ArgStringList &CmdArgs,
2513-
Action::OffloadKind DeviceOffloadKind) {
2513+
const JobAction &JA) {
25142514
// Handle various floating point optimization flags, mapping them to the
25152515
// appropriate LLVM code generation flags. This is complicated by several
25162516
// "umbrella" flags, so we do this by stepping through the flags incrementally
@@ -2533,10 +2533,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D,
25332533
// -ffp-exception-behavior options: strict, maytrap, ignore
25342534
StringRef FPExceptionBehavior = "";
25352535
const llvm::DenormalMode DefaultDenormalFPMath =
2536-
TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind);
2536+
TC.getDefaultDenormalModeForType(Args, JA);
25372537
const llvm::DenormalMode DefaultDenormalFP32Math =
2538-
TC.getDefaultDenormalModeForType(Args, DeviceOffloadKind,
2539-
&llvm::APFloat::IEEEsingle());
2538+
TC.getDefaultDenormalModeForType(Args, JA, &llvm::APFloat::IEEEsingle());
25402539

25412540
llvm::DenormalMode DenormalFPMath = DefaultDenormalFPMath;
25422541
llvm::DenormalMode DenormalFP32Math = DefaultDenormalFP32Math;
@@ -4295,7 +4294,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
42954294
CmdArgs.push_back("-mdisable-tail-calls");
42964295

42974296
RenderFloatingPointOptions(TC, D, isOptimizationLevelFast(Args), Args,
4298-
CmdArgs, JA.getOffloadingDeviceKind());
4297+
CmdArgs, JA);
42994298

43004299
// Render ABI arguments
43014300
switch (TC.getArch()) {
@@ -4618,8 +4617,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
46184617
if (Args.hasArg(options::OPT_fsplit_stack))
46194618
CmdArgs.push_back("-split-stacks");
46204619

4621-
RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs,
4622-
JA.getOffloadingDeviceKind());
4620+
RenderFloatingPointOptions(TC, D, OFastEnabled, Args, CmdArgs, JA);
46234621

46244622
if (Arg *A = Args.getLastArg(options::OPT_mdouble_EQ)) {
46254623
if (TC.getArch() == llvm::Triple::avr)

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -721,17 +721,17 @@ void CudaToolChain::addClangTargetOptions(
721721
}
722722

723723
llvm::DenormalMode CudaToolChain::getDefaultDenormalModeForType(
724-
const llvm::opt::ArgList &DriverArgs, Action::OffloadKind DeviceOffloadKind,
724+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
725725
const llvm::fltSemantics *FPType) const {
726-
if (DeviceOffloadKind == Action::OFK_Cuda) {
726+
if (JA.getOffloadingDeviceKind() == Action::OFK_Cuda) {
727727
if (FPType && FPType == &llvm::APFloat::IEEEsingle() &&
728728
DriverArgs.hasFlag(options::OPT_fcuda_flush_denormals_to_zero,
729729
options::OPT_fno_cuda_flush_denormals_to_zero,
730730
false))
731731
return llvm::DenormalMode::getPreserveSign();
732732
}
733733

734-
assert(DeviceOffloadKind != Action::OFK_Host);
734+
assert(JA.getOffloadingDeviceKind() != Action::OFK_Host);
735735
return llvm::DenormalMode::getIEEE();
736736
}
737737

clang/lib/Driver/ToolChains/Cuda.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,7 @@ class LLVM_LIBRARY_VISIBILITY CudaToolChain : public ToolChain {
156156
Action::OffloadKind DeviceOffloadKind) const override;
157157

158158
llvm::DenormalMode getDefaultDenormalModeForType(
159-
const llvm::opt::ArgList &DriverArgs,
160-
Action::OffloadKind DeviceOffloadKind,
159+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
161160
const llvm::fltSemantics *FPType = nullptr) const override;
162161

163162
// Never try to use the integrated assembler with CUDA; always fork out to

clang/lib/Driver/ToolChains/Linux.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -988,10 +988,10 @@ void Linux::addProfileRTLibs(const llvm::opt::ArgList &Args,
988988
ToolChain::addProfileRTLibs(Args, CmdArgs);
989989
}
990990

991-
llvm::DenormalMode Linux::getDefaultDenormalModeForType(
992-
const llvm::opt::ArgList &DriverArgs,
993-
Action::OffloadKind DeviceOffloadKind,
994-
const llvm::fltSemantics *FPType) const {
991+
llvm::DenormalMode
992+
Linux::getDefaultDenormalModeForType(const llvm::opt::ArgList &DriverArgs,
993+
const JobAction &JA,
994+
const llvm::fltSemantics *FPType) const {
995995
switch (getTriple().getArch()) {
996996
case llvm::Triple::x86:
997997
case llvm::Triple::x86_64: {

clang/lib/Driver/ToolChains/Linux.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,8 @@ class LLVM_LIBRARY_VISIBILITY Linux : public Generic_ELF {
4949
std::vector<std::string> ExtraOpts;
5050

5151
llvm::DenormalMode getDefaultDenormalModeForType(
52-
const llvm::opt::ArgList &DriverArgs,
53-
Action::OffloadKind DeviceOffloadKind,
54-
const llvm::fltSemantics *FPType = nullptr) const override;
52+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
53+
const llvm::fltSemantics *FPType = nullptr) const override;
5554

5655
protected:
5756
Tool *buildAssembler() const override;

clang/lib/Driver/ToolChains/PS4CPU.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,8 @@ class LLVM_LIBRARY_VISIBILITY PS4CPU : public Generic_ELF {
9494
Action::OffloadKind DeviceOffloadingKind) const override;
9595

9696
llvm::DenormalMode getDefaultDenormalModeForType(
97-
const llvm::opt::ArgList &DriverArgs,
98-
Action::OffloadKind DeviceOffloadKind,
99-
const llvm::fltSemantics *FPType) const override {
97+
const llvm::opt::ArgList &DriverArgs, const JobAction &JA,
98+
const llvm::fltSemantics *FPType) const override {
10099
// DAZ and FTZ are on by default.
101100
return llvm::DenormalMode::getPreserveSign();
102101
}

clang/test/Driver/cuda-flush-denormals-to-zero.cu

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,28 @@
77
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fcuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=FTZ %s
88
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=sm_70 -fno-cuda-flush-denormals-to-zero -nocudainc -nocudalib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
99

10-
// Test explicit argument.
10+
// Test explicit argument, with CUDA offload kind
1111
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
1212
// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
13+
14+
// Test explicit argument, with HIP offload kind
15+
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
16+
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
17+
1318
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fcuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
1419
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -fno-cuda-flush-denormals-to-zero -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
1520

16-
// Test the default changing with no argument based on the subtarget.
21+
// Test the default changing with no argument based on the subtarget in HIP mode
1722
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZ %s
1823
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
1924

25+
26+
// Test multiple offload archs with different defaults.
27+
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=MIXED-DEFAULT-MODE %s
28+
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell -fcuda-flush-denormals-to-zero --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=FTZX2 %s
29+
// RUN: %clang -x hip -no-canonical-prefixes -### -target x86_64-linux-gnu -c -march=haswell -fno-cuda-flush-denormals-to-zero --cuda-gpu-arch=gfx803 --cuda-gpu-arch=gfx900 -nocudainc -nogpulib %s 2>&1 | FileCheck -check-prefix=NOFTZ %s
30+
31+
2032
// CPUFTZ-NOT: -fdenormal-fp-math
2133

2234
// FTZ-NOT: -fdenormal-fp-math-f32=
@@ -25,3 +37,13 @@
2537
// The default of ieee is omitted
2638
// NOFTZ-NOT: "-fdenormal-fp-math"
2739
// NOFTZ-NOT: "-fdenormal-fp-math-f32"
40+
41+
// MIXED-DEFAULT-MODE-NOT: -denormal-fp-math
42+
// MIXED-DEFAULT-MODE: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"
43+
// MIXED-DEFAULT-MODE-SAME: "-target-cpu" "gfx803"
44+
// MIXED-DEFAULT-MODE-NOT: -denormal-fp-math
45+
46+
// FTZX2: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"
47+
// FTZX2-SAME: "-target-cpu" "gfx803"
48+
// FTZX2: "-fdenormal-fp-math-f32=preserve-sign,preserve-sign"
49+
// FTZX2-SAME: "-target-cpu" "gfx900"

0 commit comments

Comments
 (0)