Skip to content

Commit 8d22100

Browse files
committed
[LICM] Support hoisting of dynamic allocas out of loops
This patch implements a correct, but not terribly useful, transform. In particular, if we have a dynamic alloca in a loop which is guaranteed to execute, and provably not captured, we hoist the alloca out of the loop. The capture tracking is needed so that we can prove that each previous stack region dies before the next one is allocated. The transform decreases the amount of stack allocation needed by a linear factor (e.g. the iteration count of the loop). Now, I really hope no one is actually using dynamic allocas. As such, why this patch? Well, the actual problem I'm hoping to make progress on is allocation hoisting. There's a large draft patch out for review (https://reviews.llvm.org/D60056), and this patch was the smallest chunk of testable functionality I could come up with which takes a step vaguely in that direction. Once this is in, it makes motivating the changes to capture tracking mentioned in TODOs testable. After that, I hope to extend this to trivial malloc free regions (i.e. free dominating all loop exits) and allocation functions for GCed languages. Differential Revision: https://reviews.llvm.org/D69227
1 parent 787dba7 commit 8d22100

File tree

2 files changed

+213
-0
lines changed

2 files changed

+213
-0
lines changed

llvm/lib/Transforms/Scalar/LICM.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,41 @@ class ControlFlowHoister {
789789
};
790790
} // namespace
791791

792+
793+
/// Return true if we know how to rewrite all uses of the given alloca after
794+
/// hoisting it out of the loop. The main concerns are a) potential captures
795+
/// and b) invariant.start markers which don't capture, but are no longer
796+
/// valid w/o a corresponding invariant.end.
797+
static bool canRewriteUsesOfAlloca(AllocaInst &AI) {
798+
// TODO: This looks a lot like capture tracking, but we need to remove any
799+
// invariant starts if we extend the lifetime of the alloca by hoisting it.
800+
// We should probably refactor capture tracking into a form which allows us
801+
// to reuse the relevant bits and remove the duplicated logic here.
802+
803+
SmallVector<Use *, 16> Worklist;
804+
for (Use &U : AI.uses())
805+
Worklist.push_back(&U);
806+
807+
unsigned NumUsesExplored = 0;
808+
while (!Worklist.empty()) {
809+
Use *U = Worklist.pop_back_val();
810+
Instruction *I = cast<Instruction>(U->getUser());
811+
NumUsesExplored++;
812+
if (NumUsesExplored > DefaultMaxUsesToExplore)
813+
return false;
814+
// Non capturing, terminating uses
815+
if (isa<LoadInst>(I) ||
816+
(isa<StoreInst>(I) && U->getOperandNo() == 1))
817+
continue;
818+
// Non capturing, non-terminating
819+
if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I))
820+
return false;
821+
for (Use &U : I->uses())
822+
Worklist.push_back(&U);
823+
}
824+
return true;
825+
}
826+
792827
/// Walk the specified region of the CFG (defined by all blocks dominated by
793828
/// the specified block, and that are in the current loop) in depth first
794829
/// order w.r.t the DominatorTree. This allows us to visit definitions before
@@ -909,6 +944,16 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
909944
continue;
910945
}
911946

947+
if (isa<AllocaInst>(&I) &&
948+
SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
949+
canRewriteUsesOfAlloca(cast<AllocaInst>(I))) {
950+
hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
951+
MSSAU, SE, ORE);
952+
HoistedInstructions.push_back(&I);
953+
Changed = true;
954+
continue;
955+
}
956+
912957
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
913958
if (CFH.canHoistPHI(PN)) {
914959
// Redirect incoming blocks first to ensure that we create hoisted
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -S -licm < %s | FileCheck %s
3+
4+
@G = external global i64
5+
6+
; The alloca executes on every iteration and its only direct use is a load
; (non-capturing), so LICM hoists the alloca out of the loop; the dependent
; load and the store to @G follow it out, leaving only the induction
; variable in the loop body.
define void @test(i64 %n) {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca i64
; CHECK-NEXT: [[VAL:%.*]] = load i64, i64* [[A]]
; CHECK-NEXT: store i64 [[VAL]], i64* @G
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %a = alloca i64
  %val = load i64, i64* %a
  store i64 %val, i64* @G
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp ult i64 %iv, %n
  br i1 %exitcond, label %for.body, label %exit
exit:
  ret void
}
35+
36+
; The stored value (%iv) varies per iteration, so only the alloca itself is
; hoisted.  Per the CHECK lines, the store to the now provably-unobserved
; stack slot is then sunk into the exit block using the LCSSA value of the
; induction variable.
define void @test2(i64 %n) {
; CHECK-LABEL: @test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca i64
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], [[FOR_BODY]] ]
; CHECK-NEXT: store i64 [[IV_LCSSA]], i64* [[A]], align 4
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %a = alloca i64
  store i64 %iv, i64* %a
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp ult i64 %iv, %n
  br i1 %exitcond, label %for.body, label %exit
exit:
  ret void
}
64+
65+
66+
; bitcast and getelementptr are non-capturing address computations, so the
; alloca, both derived pointers, and the invariant store through them are
; all hoisted to the preheader.
define void @test3(i64 %n) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[A:%.*]] = alloca i64
; CHECK-NEXT: [[A_I8:%.*]] = bitcast i64* [[A]] to i8*
; CHECK-NEXT: [[A_OFFSET:%.*]] = getelementptr i8, i8* [[A_I8]], i64 4
; CHECK-NEXT: store i8 0, i8* [[A_OFFSET]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %a = alloca i64
  %a.i8 = bitcast i64* %a to i8*
  %a.offset = getelementptr i8, i8* %a.i8, i64 4
  store i8 0, i8* %a.offset
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp ult i64 %iv, %n
  br i1 %exitcond, label %for.body, label %exit
exit:
  ret void
}
97+
98+
; This example is subtle. Because the dynamic alloca isn't reclaimed until
; end of function scope, the captured value can legally point to a dynamic
; alloca stack region from a previous iteration.  Hoisting would collapse
; those per-iteration regions into a single slot, so the capturing call must
; block the transform: the alloca stays in the loop.  (The CHECK lines omit
; the dead %a.i8 bitcast — presumably it is cleaned up as trivially dead
; during the LICM run; verify against the pass output if regenerating.)
define void @test4(i64 %n) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[A:%.*]] = alloca i64
; CHECK-NEXT: store i64 [[IV]], i64* [[A]]
; CHECK-NEXT: call void @capture(i64* [[A]])
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %a = alloca i64
  store i64 %iv, i64* %a
  %a.i8 = bitcast i64* %a to i8*
  call void @capture(i64* %a)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp ult i64 %iv, %n
  br i1 %exitcond, label %for.body, label %exit
exit:
  ret void
}
131+
declare void @capture(i64* %a)
132+
133+
134+
; TODO: not yet handled
; The alloca is marked with llvm.invariant.start but has no matching
; invariant.end, so extending its lifetime by hoisting would leave a stale
; invariance marker.  canRewriteUsesOfAlloca rejects the intrinsic call as
; an unknown user, and nothing is hoisted.
define void @test5(i64 %n) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[A:%.*]] = alloca i64
; CHECK-NEXT: store i64 [[IV]], i64* [[A]]
; CHECK-NEXT: [[A_I8:%.*]] = bitcast i64* [[A]] to i8*
; CHECK-NEXT: [[TMP0:%.*]] = call {}* @llvm.invariant.start.p0i8(i64 8, i8* [[A_I8]])
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %a = alloca i64
  store i64 %iv, i64* %a
  %a.i8 = bitcast i64* %a to i8*
  call {}* @llvm.invariant.start.p0i8(i64 8, i8* %a.i8)
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp ult i64 %iv, %n
  br i1 %exitcond, label %for.body, label %exit
exit:
  ret void
}
166+
167+
declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
168+

0 commit comments

Comments
 (0)