Skip to content

Commit 94b07ab

Browse files
tlively authored and alexcrichton committed
[WebAssembly] Restore defaults for stores per memop
Summary: Large slowdowns were observed in Rust due to many small, constant-sized copies in conjunction with poorly-optimized memory.copy implementations. Since memory.copy cannot be expected to be inlined efficiently by engines at this time, stop using it for the smallest copies. We continue to lower all memcpy intrinsics to memory.copy, though.

Reviewers: aheejin, alexcrichton

Subscribers: dschuff, sbc100, jgravelle-google, hiraditya, JDevlieghere, sunfish, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D67639

llvm-svn: 372275
1 parent 8adf9bd commit 94b07ab

File tree

2 files changed

+20
-30
lines changed

2 files changed

+20
-30
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 0 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -259,16 +259,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
259259

260260
setMaxAtomicSizeInBitsSupported(64);
261261

262-
if (Subtarget->hasBulkMemory()) {
263-
// Use memory.copy and friends over multiple loads and stores
264-
MaxStoresPerMemcpy = 1;
265-
MaxStoresPerMemcpyOptSize = 1;
266-
MaxStoresPerMemmove = 1;
267-
MaxStoresPerMemmoveOptSize = 1;
268-
MaxStoresPerMemset = 1;
269-
MaxStoresPerMemsetOptSize = 1;
270-
}
271-
272262
// Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is
273263
// consistent with the f64 and f128 names.
274264
setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");

llvm/test/CodeGen/WebAssembly/bulk-memory.ll

Lines changed: 20 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -142,8 +142,8 @@ define void @memset_1024(i8* %dest, i8 %val) {
142142
}
143143

144144
; The following tests check that frame index elimination works for
145-
; bulk memory instructions. The stack pointer is bumped by 16 instead
146-
; of 10 because the stack pointer in WebAssembly is currently always
145+
; bulk memory instructions. The stack pointer is bumped by 112 instead
146+
; of 100 because the stack pointer in WebAssembly is currently always
147147
; 16-byte aligned, even in leaf functions, although it is not written
148148
; back to the global in this case.
149149

@@ -156,52 +156,52 @@ define void @memset_1024(i8* %dest, i8 %val) {
156156
; NO-BULK-MEM-NOT: memory.copy
157157
; BULK-MEM-NEXT: .functype memcpy_alloca_src (i32) -> ()
158158
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
159-
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
159+
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
160160
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
161-
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 6
161+
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
162162
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
163-
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 10
163+
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
164164
; BULK-MEM-NEXT: memory.copy 0, 0, $0, $pop[[L4]], $pop[[L5]]
165165
; BULK-MEM-NEXT: return
166166
define void @memcpy_alloca_src(i8* %dst) {
167-
%a = alloca [10 x i8]
168-
%p = bitcast [10 x i8]* %a to i8*
169-
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %p, i32 10, i1 false)
167+
%a = alloca [100 x i8]
168+
%p = bitcast [100 x i8]* %a to i8*
169+
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %p, i32 100, i1 false)
170170
ret void
171171
}
172172

173173
; CHECK-LABEL: memcpy_alloca_dst:
174174
; NO-BULK-MEM-NOT: memory.copy
175175
; BULK-MEM-NEXT: .functype memcpy_alloca_dst (i32) -> ()
176176
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
177-
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
177+
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
178178
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
179-
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 6
179+
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
180180
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
181-
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 10
181+
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
182182
; BULK-MEM-NEXT: memory.copy 0, 0, $pop[[L4]], $0, $pop[[L5]]
183183
; BULK-MEM-NEXT: return
184184
define void @memcpy_alloca_dst(i8* %src) {
185-
%a = alloca [10 x i8]
186-
%p = bitcast [10 x i8]* %a to i8*
187-
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %src, i32 10, i1 false)
185+
%a = alloca [100 x i8]
186+
%p = bitcast [100 x i8]* %a to i8*
187+
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %p, i8* %src, i32 100, i1 false)
188188
ret void
189189
}
190190

191191
; CHECK-LABEL: memset_alloca:
192192
; NO-BULK-MEM-NOT: memory.fill
193193
; BULK-MEM-NEXT: .functype memset_alloca (i32) -> ()
194194
; BULK-MEM-NEXT: global.get $push[[L0:[0-9]+]]=, __stack_pointer
195-
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 16
195+
; BULK-MEM-NEXT: i32.const $push[[L1:[0-9]+]]=, 112
196196
; BULK-MEM-NEXT: i32.sub $push[[L2:[0-9]+]]=, $pop[[L0]], $pop[[L1]]
197-
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 6
197+
; BULK-MEM-NEXT: i32.const $push[[L3:[0-9]+]]=, 12
198198
; BULK-MEM-NEXT: i32.add $push[[L4:[0-9]+]]=, $pop[[L2]], $pop[[L3]]
199-
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 10
199+
; BULK-MEM-NEXT: i32.const $push[[L5:[0-9]+]]=, 100
200200
; BULK-MEM-NEXT: memory.fill 0, $pop[[L4]], $0, $pop[[L5]]
201201
; BULK-MEM-NEXT: return
202202
define void @memset_alloca(i8 %val) {
203-
%a = alloca [10 x i8]
204-
%p = bitcast [10 x i8]* %a to i8*
205-
call void @llvm.memset.p0i8.i32(i8* %p, i8 %val, i32 10, i1 false)
203+
%a = alloca [100 x i8]
204+
%p = bitcast [100 x i8]* %a to i8*
205+
call void @llvm.memset.p0i8.i32(i8* %p, i8 %val, i32 100, i1 false)
206206
ret void
207207
}

0 commit comments

Comments
 (0)