Skip to content

Commit 19cdd19

Browse files
committed
[AArch64][GlobalISel] Add heuristics for localizing G_CONSTANT.
This adds heuristics similar to those for G_GLOBAL_VALUE, querying the cost of materializing a specific constant in code size. Doing so prevents us from sinking constants which require multiple instructions to generate into use blocks.

Code size savings on CTMark -Os:

Program                              size.__text
                                     before     after      diff
ClamAV/clamscan                      381940.00  382052.00   0.0%
lencod/lencod                        428408.00  428428.00   0.0%
SPASS/SPASS                          411868.00  411876.00   0.0%
kimwitu++/kc                         449944.00  449944.00   0.0%
Bullet/bullet                        463588.00  463556.00  -0.0%
sqlite3/sqlite3                      284696.00  284668.00  -0.0%
consumer-typeset/consumer-typeset    414492.00  414424.00  -0.0%
7zip/7zip-benchmark                  595244.00  594972.00  -0.0%
mafft/pairlocalalign                 247512.00  247368.00  -0.1%
tramp3d-v4/tramp3d-v4                372884.00  372044.00  -0.2%
Geomean difference                                         -0.0%

Differential Revision: https://reviews.llvm.org/D130554
1 parent 2430156 commit 19cdd19

File tree

5 files changed

+161
-14
lines changed

5 files changed

+161
-14
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

+5
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,11 @@ class MachineRegisterInfo {
584584
/// multiple uses.
585585
bool hasOneNonDBGUser(Register RegNo) const;
586586

587+
588+
/// hasAtMostUserInstrs - Return true if the given register has at most \p MaxUsers
589+
/// non-debug user instructions.
590+
bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const;
591+
587592
/// replaceRegWith - Replace all instances of FromReg with ToReg in the
588593
/// machine function. This is like llvm-level X->replaceAllUsesWith(Y),
589594
/// except that it also changes any definitions of the register as well.

llvm/lib/CodeGen/MachineRegisterInfo.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,16 @@ bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
420420
return hasSingleElement(use_nodbg_instructions(RegNo));
421421
}
422422

423+
// Returns true when walking Reg's non-debug use-instruction list reaches the
// end after counting at most MaxUsers entries. The walk stops early once
// MaxUsers entries have been counted, so it never traverses the whole list
// just to learn that the limit was exceeded.
bool MachineRegisterInfo::hasAtMostUserInstrs(Register Reg,
424+
unsigned MaxUsers) const {
425+
unsigned NumUsers = 0;
426+
auto UI = use_instr_nodbg_begin(Reg), UE = use_instr_nodbg_end();
427+
for (; UI != UE && NumUsers < MaxUsers; ++UI)
428+
NumUsers++;
429+
// If we haven't reached the end yet then there are more than MaxUsers users.
430+
return UI == UE;
431+
}
432+
423433
/// clearKillFlags - Iterate over all the uses of the given register and
424434
/// clear the kill flag from the MachineOperand. This function is used by
425435
/// optimization passes which extend register lifetimes and need only

llvm/lib/CodeGen/TargetLoweringBase.cpp

+1-14
Original file line numberDiff line numberDiff line change
@@ -2335,18 +2335,6 @@ bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
23352335
llvm_unreachable("Unexpected remat cost");
23362336
};
23372337

2338-
// Helper to walk through uses and terminate if we've reached a limit. Saves
2339-
// us spending time traversing uses if all we want to know is if it's >= min.
2340-
auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
2341-
unsigned NumUses = 0;
2342-
auto UI = MRI.use_instr_nodbg_begin(Reg), UE = MRI.use_instr_nodbg_end();
2343-
for (; UI != UE && NumUses < MaxUses; ++UI) {
2344-
NumUses++;
2345-
}
2346-
// If we haven't reached the end yet then there are more than MaxUses users.
2347-
return UI == UE;
2348-
};
2349-
23502338
switch (MI.getOpcode()) {
23512339
default:
23522340
return false;
@@ -2363,8 +2351,7 @@ bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
23632351
unsigned MaxUses = maxUses(RematCost);
23642352
if (MaxUses == UINT_MAX)
23652353
return true; // Remats are "free" so always localize.
2366-
bool B = isUsesAtMost(Reg, MaxUses);
2367-
return B;
2354+
return MRI.hasAtMostUserInstrs(Reg, MaxUses);
23682355
}
23692356
}
23702357
}

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

+29
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "llvm/ADT/Twine.h"
3232
#include "llvm/Analysis/MemoryLocation.h"
3333
#include "llvm/Analysis/ObjCARCUtil.h"
34+
#include "llvm/Analysis/TargetTransformInfo.h"
3435
#include "llvm/Analysis/VectorUtils.h"
3536
#include "llvm/CodeGen/Analysis.h"
3637
#include "llvm/CodeGen/CallingConvLower.h"
@@ -75,6 +76,7 @@
7576
#include "llvm/Support/Compiler.h"
7677
#include "llvm/Support/Debug.h"
7778
#include "llvm/Support/ErrorHandling.h"
79+
#include "llvm/Support/InstructionCost.h"
7880
#include "llvm/Support/KnownBits.h"
7981
#include "llvm/Support/MachineValueType.h"
8082
#include "llvm/Support/MathExtras.h"
@@ -20789,6 +20791,21 @@ bool AArch64TargetLowering::needsFixedCatchObjects() const {
2078920791

2079020792
bool AArch64TargetLowering::shouldLocalize(
2079120793
const MachineInstr &MI, const TargetTransformInfo *TTI) const {
20794+
auto &MF = *MI.getMF();
20795+
auto &MRI = MF.getRegInfo();
20796+
auto maxUses = [](unsigned RematCost) {
20797+
// A cost of 1 means remats are basically free.
20798+
if (RematCost == 1)
20799+
return UINT_MAX;
20800+
if (RematCost == 2)
20801+
return 2U;
20802+
20803+
// Remat is too expensive, only sink if there's one user.
20804+
if (RematCost > 2)
20805+
return 1U;
20806+
llvm_unreachable("Unexpected remat cost");
20807+
};
20808+
2079220809
switch (MI.getOpcode()) {
2079320810
case TargetOpcode::G_GLOBAL_VALUE: {
2079420811
// On Darwin, TLS global vars get selected into function calls, which
@@ -20799,6 +20816,18 @@ bool AArch64TargetLowering::shouldLocalize(
2079920816
return false;
2080020817
break;
2080120818
}
20819+
case TargetOpcode::G_CONSTANT: {
20820+
auto *CI = MI.getOperand(1).getCImm();
20821+
APInt Imm = CI->getValue();
20822+
InstructionCost Cost = TTI->getIntImmCost(
20823+
Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
20824+
assert(Cost.isValid() && "Expected a valid imm cost");
20825+
20826+
unsigned RematCost = *Cost.getValue();
20827+
Register Reg = MI.getOperand(0).getReg();
20828+
unsigned MaxUses = maxUses(RematCost);
20829+
return MRI.hasAtMostUserInstrs(Reg, MaxUses);
20830+
}
2080220831
// If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
2080320832
// localizable.
2080420833
case AArch64::ADRP:

llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll

+116
Original file line numberDiff line numberDiff line change
@@ -114,3 +114,119 @@ if.end:
114114
ret i32 0
115115
}
116116

117+
; The i32 constant -2228259 is stored in three different blocks (if.then,
; if.then2 and if.end). The assertions below expect a single G_CONSTANT for it
; in bb.1.entry that is used by the G_STOREs in bb.2, bb.3 and bb.4, i.e. the
; constant is not duplicated into its use blocks.
define i32 @imm_cost_too_large_cost_of_2() {
118+
; CHECK-LABEL: name: imm_cost_too_large_cost_of_2
119+
; CHECK: bb.1.entry:
120+
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
121+
; CHECK-NEXT: {{ $}}
122+
; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 -2228259
123+
; CHECK-NEXT: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
124+
; CHECK-NEXT: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
125+
; CHECK-NEXT: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
126+
; CHECK-NEXT: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
127+
; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s32) from @var1)
128+
; CHECK-NEXT: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
129+
; CHECK-NEXT: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s32), [[C2]]
130+
; CHECK-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C2]]
131+
; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.4
132+
; CHECK-NEXT: G_BR %bb.2
133+
; CHECK-NEXT: {{ $}}
134+
; CHECK-NEXT: bb.2.if.then:
135+
; CHECK-NEXT: successors: %bb.3(0x80000000)
136+
; CHECK-NEXT: {{ $}}
137+
; CHECK-NEXT: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2
138+
; CHECK-NEXT: G_STORE [[C]](s32), [[GV3]](p0) :: (store (s32) into @var2)
139+
; CHECK-NEXT: G_BR %bb.3
140+
; CHECK-NEXT: {{ $}}
141+
; CHECK-NEXT: bb.3.if.then2:
142+
; CHECK-NEXT: successors: %bb.4(0x80000000)
143+
; CHECK-NEXT: {{ $}}
144+
; CHECK-NEXT: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1
145+
; CHECK-NEXT: G_STORE [[C]](s32), [[GV4]](p0) :: (store (s32) into @var1)
146+
; CHECK-NEXT: G_BR %bb.4
147+
; CHECK-NEXT: {{ $}}
148+
; CHECK-NEXT: bb.4.if.end:
149+
; CHECK-NEXT: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3
150+
; CHECK-NEXT: G_STORE [[C]](s32), [[GV5]](p0) :: (store (s32) into @var3)
151+
; CHECK-NEXT: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0
152+
; CHECK-NEXT: $w0 = COPY [[C3]](s32)
153+
; CHECK-NEXT: RET_ReallyLR implicit $w0
154+
entry:
155+
%0 = load i32, i32* @var1, align 4
156+
%cst1 = bitcast i32 -2228259 to i32
157+
%cmp = icmp eq i32 %0, 1
158+
br i1 %cmp, label %if.then, label %if.end
159+
160+
if.then:
161+
store i32 %cst1, i32* @var2
162+
br label %if.then2
163+
164+
if.then2:
165+
store i32 %cst1, i32* @var1
166+
br label %if.end
167+
168+
if.end:
169+
store i32 %cst1, i32* @var3
170+
ret i32 0
171+
}
172+
173+
; 64-bit variant: the i64 constant -2228259 is stored in three different blocks
; (if.then, if.then2 and if.end). The assertions below expect a single
; G_CONSTANT for it in bb.1.entry that is used by the G_STOREs in bb.2, bb.3
; and bb.4, i.e. the constant is not duplicated into its use blocks.
define i64 @imm_cost_too_large_cost_of_4() {
174+
; CHECK-LABEL: name: imm_cost_too_large_cost_of_4
175+
; CHECK: bb.1.entry:
176+
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
177+
; CHECK-NEXT: {{ $}}
178+
; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 -2228259
179+
; CHECK-NEXT: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
180+
; CHECK-NEXT: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
181+
; CHECK-NEXT: [[C1:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
182+
; CHECK-NEXT: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
183+
; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[GV2]](p0) :: (dereferenceable load (s64) from @var1_64, align 4)
184+
; CHECK-NEXT: [[C2:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
185+
; CHECK-NEXT: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s64), [[C2]]
186+
; CHECK-NEXT: [[C3:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1
187+
; CHECK-NEXT: [[AND:%[0-9]+]]:gpr(s32) = G_AND [[ICMP]], [[C3]]
188+
; CHECK-NEXT: G_BRCOND [[AND]](s32), %bb.4
189+
; CHECK-NEXT: G_BR %bb.2
190+
; CHECK-NEXT: {{ $}}
191+
; CHECK-NEXT: bb.2.if.then:
192+
; CHECK-NEXT: successors: %bb.3(0x80000000)
193+
; CHECK-NEXT: {{ $}}
194+
; CHECK-NEXT: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2_64
195+
; CHECK-NEXT: G_STORE [[C]](s64), [[GV3]](p0) :: (store (s64) into @var2_64)
196+
; CHECK-NEXT: G_BR %bb.3
197+
; CHECK-NEXT: {{ $}}
198+
; CHECK-NEXT: bb.3.if.then2:
199+
; CHECK-NEXT: successors: %bb.4(0x80000000)
200+
; CHECK-NEXT: {{ $}}
201+
; CHECK-NEXT: [[GV4:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1_64
202+
; CHECK-NEXT: G_STORE [[C]](s64), [[GV4]](p0) :: (store (s64) into @var1_64)
203+
; CHECK-NEXT: G_BR %bb.4
204+
; CHECK-NEXT: {{ $}}
205+
; CHECK-NEXT: bb.4.if.end:
206+
; CHECK-NEXT: [[GV5:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var3_64
207+
; CHECK-NEXT: G_STORE [[C]](s64), [[GV5]](p0) :: (store (s64) into @var3_64)
208+
; CHECK-NEXT: [[C4:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 0
209+
; CHECK-NEXT: $x0 = COPY [[C4]](s64)
210+
; CHECK-NEXT: RET_ReallyLR implicit $x0
211+
entry:
212+
%0 = load i64, i64* @var1_64, align 4
213+
%cst1 = bitcast i64 -2228259 to i64
214+
%cmp = icmp eq i64 %0, 1
215+
br i1 %cmp, label %if.then, label %if.end
216+
217+
if.then:
218+
store i64 %cst1, i64* @var2_64
219+
br label %if.then2
220+
221+
if.then2:
222+
store i64 %cst1, i64* @var1_64
223+
br label %if.end
224+
225+
if.end:
226+
store i64 %cst1, i64* @var3_64
227+
ret i64 0
228+
}
229+
230+
@var1_64 = common global i64 0, align 4
231+
@var2_64 = common global i64 0, align 4
232+
@var3_64 = common global i64 0, align 4

0 commit comments

Comments
 (0)