Skip to content

Commit beb12f9

Browse files
authored
[RISCV] Add +optimized-nfN-segment-load-store (llvm#114414)
This is a follow-up to llvm#111511, where after benchmarking we learnt that the Banana Pi F3 has fast segmented loads for not just NF=2, but also NF=3 and NF=4 (see https://github.com/preames/bp3-microarch#vlseg_lmul_x_sew_throughput). This adds tuning features to allow these segment loads and stores to be costed cheaper and enables them for the spacemit-x60. It also enables +optimized-nf2-segment-load-store by default in the generic tuning to maintain the previous behaviour when compiled without -mcpu or -mtune.
1 parent 6bad451 commit beb12f9

File tree

4 files changed

+210
-87
lines changed

4 files changed

+210
-87
lines changed

llvm/lib/Target/RISCV/RISCVFeatures.td

+7
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,13 @@ def TuneOptimizedZeroStrideLoad
13861386
"true", "Optimized (perform fewer memory operations)"
13871387
"zero-stride vector load">;
13881388

1389+
// Defines one tuning feature per segment count nf in 2..8, named
// "optimized-nf<nf>-segment-load-store" and backed by the subtarget field
// HasOptimizedNF<nf>SegmentLoadStore.
// The description is built from adjacent string literals, which are
// concatenated as-is, so the first literal must end with a space.
foreach nf = {2-8} in
1390+
def TuneOptimizedNF#nf#SegmentLoadStore :
1391+
SubtargetFeature<"optimized-nf"#nf#"-segment-load-store",
1392+
"HasOptimizedNF"#nf#"SegmentLoadStore",
1393+
"true", "vlseg"#nf#"eN.v and vsseg"#nf#"eN.v are "
1394+
"implemented as a wide memory op and shuffle">;
1395+
13891396
def Experimental
13901397
: SubtargetFeature<"experimental", "HasExperimental",
13911398
"true", "Experimental intrinsics">;

llvm/lib/Target/RISCV/RISCVProcessors.td

+12-4
Original file line numberDiff line numberDiff line change
@@ -57,15 +57,19 @@ class RISCVTuneProcessorModel<string n,
5757
list<SubtargetFeature> f = []>
5858
: ProcessorModel<n, m, f,tunef>;
5959

60+
defvar GenericTuneFeatures = [TuneOptimizedNF2SegmentLoadStore];
61+
6062
def GENERIC_RV32 : RISCVProcessorModel<"generic-rv32",
6163
NoSchedModel,
6264
[Feature32Bit,
63-
FeatureStdExtI]>,
65+
FeatureStdExtI],
66+
GenericTuneFeatures>,
6467
GenericTuneInfo;
6568
def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64",
6669
NoSchedModel,
6770
[Feature64Bit,
68-
FeatureStdExtI]>,
71+
FeatureStdExtI],
72+
GenericTuneFeatures>,
6973
GenericTuneInfo;
7074
// Support generic for compatibility with other targets. The triple will be used
7175
// to change to the appropriate rv32/rv64 version.
@@ -221,7 +225,8 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74",
221225

222226
defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures,
223227
[TuneDLenFactor2,
224-
TuneOptimizedZeroStrideLoad]);
228+
TuneOptimizedZeroStrideLoad,
229+
TuneOptimizedNF2SegmentLoadStore]);
225230
def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model,
226231
[Feature64Bit,
227232
FeatureStdExtI,
@@ -472,7 +477,10 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
472477
FeatureStdExtZvfh,
473478
FeatureStdExtZvkt,
474479
FeatureStdExtZvl256b]),
475-
[TuneDLenFactor2]>;
480+
[TuneDLenFactor2,
481+
TuneOptimizedNF2SegmentLoadStore,
482+
TuneOptimizedNF3SegmentLoadStore,
483+
TuneOptimizedNF4SegmentLoadStore]>;
476484

477485
def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3",
478486
NoSchedModel,

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp

+25-3
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,28 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
716716
return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
717717
}
718718

719+
/// Returns whether the subtarget \p ST reports the
/// "optimized segment load/store" tuning property for segment count \p NF.
///
/// \param NF  Number of fields in the segment access; only 2 through 8 are
///            handled, each mapping to the matching
///            hasOptimizedNF<NF>SegmentLoadStore() query on the subtarget.
/// \param ST  Subtarget whose tuning flags are queried.
/// \returns   The value of the subtarget query for the given NF.
///
/// Any NF outside 2..8 reaches llvm_unreachable("Unexpected NF").
static bool hasOptimizedSegmentLoadStore(unsigned NF,
720+
const RISCVSubtarget *ST) {
721+
switch (NF) {
722+
case 2:
723+
return ST->hasOptimizedNF2SegmentLoadStore();
724+
case 3:
725+
return ST->hasOptimizedNF3SegmentLoadStore();
726+
case 4:
727+
return ST->hasOptimizedNF4SegmentLoadStore();
728+
case 5:
729+
return ST->hasOptimizedNF5SegmentLoadStore();
730+
case 6:
731+
return ST->hasOptimizedNF6SegmentLoadStore();
732+
case 7:
733+
return ST->hasOptimizedNF7SegmentLoadStore();
734+
case 8:
735+
return ST->hasOptimizedNF8SegmentLoadStore();
736+
default:
737+
llvm_unreachable("Unexpected NF");
738+
}
739+
}
740+
719741
InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
720742
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
721743
Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -737,9 +759,9 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
737759
TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
738760
AddressSpace, DL)) {
739761

740-
// Most available hardware today optimizes NF=2 as as one wide memory op
741-
// + Factor * LMUL shuffle ops.
742-
if (Factor == 2) {
762+
// Some processors optimize segment loads/stores as one wide memory op +
763+
// Factor * LMUL shuffle ops.
764+
if (hasOptimizedSegmentLoadStore(Factor, ST)) {
743765
InstructionCost Cost =
744766
getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
745767
MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();

0 commit comments

Comments
 (0)