1
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
1
2
; RUN: llc -opaque-pointers=0 -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
2
3
3
4
; @simple is the most basic chain of address induction variables. Chaining
4
5
; saves at least one register and avoids complex addressing and setup
5
6
; code.
6
7
;
7
- ; A9: @simple
8
8
; no expensive address computation in the preheader
9
- ; A9: lsl
10
- ; A9-NOT: lsl
11
- ; A9: %loop
12
9
; no complex address modes
13
- ; A9-NOT: lsl
14
10
define i32 @simple (i32* %a , i32* %b , i32 %x ) nounwind {
11
+ ; A9-LABEL: simple:
12
+ ; A9: @ %bb.0: @ %entry
13
+ ; A9-NEXT: .save {r4, r5, r6, lr}
14
+ ; A9-NEXT: push {r4, r5, r6, lr}
15
+ ; A9-NEXT: mov r3, r0
16
+ ; A9-NEXT: lsls r2, r2, #2
17
+ ; A9-NEXT: movs r0, #0
18
+ ; A9-NEXT: .LBB0_1: @ %loop
19
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
20
+ ; A9-NEXT: add.w lr, r3, r2
21
+ ; A9-NEXT: ldr.w r12, [r3, r2]
22
+ ; A9-NEXT: ldr r3, [r3]
23
+ ; A9-NEXT: add.w r4, lr, r2
24
+ ; A9-NEXT: ldr.w r6, [lr, r2]
25
+ ; A9-NEXT: add r0, r3
26
+ ; A9-NEXT: adds r3, r4, r2
27
+ ; A9-NEXT: add r0, r12
28
+ ; A9-NEXT: ldr r5, [r4, r2]
29
+ ; A9-NEXT: add r0, r6
30
+ ; A9-NEXT: add r3, r2
31
+ ; A9-NEXT: add r0, r5
32
+ ; A9-NEXT: cmp r3, r1
33
+ ; A9-NEXT: bne .LBB0_1
34
+ ; A9-NEXT: @ %bb.2: @ %exit
35
+ ; A9-NEXT: pop {r4, r5, r6, pc}
15
36
entry:
16
37
br label %loop
17
38
loop:
@@ -37,15 +58,34 @@ exit:
37
58
38
59
; @user is not currently chained because the IV is live across memory ops.
39
60
;
40
- ; A9: @user
41
61
; stride multiples computed in the preheader
42
- ; A9: lsl
43
- ; A9: lsl
44
- ; A9: %loop
45
62
; complex address modes
46
- ; A9: lsl
47
- ; A9: lsl
48
63
define i32 @user (i32* %a , i32* %b , i32 %x ) nounwind {
64
+ ; A9-LABEL: user:
65
+ ; A9: @ %bb.0: @ %entry
66
+ ; A9-NEXT: .save {r4, r5, r6, r7, lr}
67
+ ; A9-NEXT: push {r4, r5, r6, r7, lr}
68
+ ; A9-NEXT: add.w r3, r2, r2, lsl #1
69
+ ; A9-NEXT: lsl.w r12, r2, #4
70
+ ; A9-NEXT: lsl.w lr, r3, #2
71
+ ; A9-NEXT: movs r3, #0
72
+ ; A9-NEXT: .LBB1_1: @ %loop
73
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
74
+ ; A9-NEXT: ldr r4, [r0]
75
+ ; A9-NEXT: ldr.w r5, [r0, r2, lsl #3]
76
+ ; A9-NEXT: ldr.w r6, [r0, r2, lsl #2]
77
+ ; A9-NEXT: add r3, r4
78
+ ; A9-NEXT: ldr.w r7, [r0, lr]
79
+ ; A9-NEXT: add r3, r6
80
+ ; A9-NEXT: add r3, r5
81
+ ; A9-NEXT: add r3, r7
82
+ ; A9-NEXT: str r3, [r0]
83
+ ; A9-NEXT: add r0, r12
84
+ ; A9-NEXT: cmp r0, r1
85
+ ; A9-NEXT: bne .LBB1_1
86
+ ; A9-NEXT: @ %bb.2: @ %exit
87
+ ; A9-NEXT: mov r0, r3
88
+ ; A9-NEXT: pop {r4, r5, r6, r7, pc}
49
89
entry:
50
90
br label %loop
51
91
loop:
@@ -75,16 +115,43 @@ exit:
75
115
; used to do, and exactly what we don't want to do. LSR's new IV
76
116
; chaining feature should now undo the damage.
77
117
;
78
- ; A9: extrastride:
79
118
; no spills
80
- ; A9-NOT: str
81
119
; only one stride multiple in the preheader
82
- ; A9: lsl
83
- ; A9-NOT: {{str r|lsl}}
84
- ; A9: %for.body{{$}}
85
120
; no complex address modes or reloads
86
- ; A9-NOT: {{ldr .*[sp]|lsl}}
87
121
define void @extrastride (i8* nocapture %main , i32 %main_stride , i32* nocapture %res , i32 %x , i32 %y , i32 %z ) nounwind {
122
+ ; A9-LABEL: extrastride:
123
+ ; A9: @ %bb.0: @ %entry
124
+ ; A9-NEXT: .save {r4, r5, r6, r7, lr}
125
+ ; A9-NEXT: push {r4, r5, r6, r7, lr}
126
+ ; A9-NEXT: ldr.w r12, [sp, #24]
127
+ ; A9-NEXT: cmp.w r12, #0
128
+ ; A9-NEXT: beq .LBB2_3
129
+ ; A9-NEXT: @ %bb.1: @ %for.body.lr.ph
130
+ ; A9-NEXT: ldr r4, [sp, #20]
131
+ ; A9-NEXT: add.w lr, r3, r1
132
+ ; A9-NEXT: lsls r3, r4, #2
133
+ ; A9-NEXT: .LBB2_2: @ %for.body
134
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
135
+ ; A9-NEXT: adds r5, r0, r1
136
+ ; A9-NEXT: ldr r4, [r0, r1]
137
+ ; A9-NEXT: ldr r0, [r0]
138
+ ; A9-NEXT: subs.w r12, r12, #1
139
+ ; A9-NEXT: ldr r6, [r5, r1]
140
+ ; A9-NEXT: add r5, r1
141
+ ; A9-NEXT: add r0, r4
142
+ ; A9-NEXT: ldr r7, [r5, r1]
143
+ ; A9-NEXT: add r5, r1
144
+ ; A9-NEXT: add r0, r6
145
+ ; A9-NEXT: ldr r4, [r5, r1]
146
+ ; A9-NEXT: add r0, r7
147
+ ; A9-NEXT: add r0, r4
148
+ ; A9-NEXT: str r0, [r2]
149
+ ; A9-NEXT: add.w r0, r5, r1
150
+ ; A9-NEXT: add r2, r3
151
+ ; A9-NEXT: add r0, lr
152
+ ; A9-NEXT: bne .LBB2_2
153
+ ; A9-NEXT: .LBB2_3: @ %for.end
154
+ ; A9-NEXT: pop {r4, r5, r6, r7, pc}
88
155
entry:
89
156
%cmp8 = icmp eq i32 %z , 0
90
157
br i1 %cmp8 , label %for.end , label %for.body.lr.ph
@@ -136,10 +203,38 @@ for.end: ; preds = %for.body, %entry
136
203
; }
137
204
; where 's' can be folded into the addressing mode.
138
205
; Consequently, we should *not* form any chains.
139
- ;
140
- ; A9: foldedidx:
141
- ; A9: ldrb{{(.w)?}} {{r[0-9]|lr}}, [{{r[0-9]|lr}}, #3]
142
206
define void @foldedidx (i8* nocapture %a , i8* nocapture %b , i8* nocapture %c ) nounwind ssp {
207
+ ; A9-LABEL: foldedidx:
208
+ ; A9: @ %bb.0: @ %entry
209
+ ; A9-NEXT: .save {r4, r5, r6, lr}
210
+ ; A9-NEXT: push {r4, r5, r6, lr}
211
+ ; A9-NEXT: mov.w lr, #0
212
+ ; A9-NEXT: .LBB3_1: @ %for.body
213
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
214
+ ; A9-NEXT: ldrb.w r12, [r0, lr]
215
+ ; A9-NEXT: add.w r4, r1, lr
216
+ ; A9-NEXT: ldrb.w r3, [r1, lr]
217
+ ; A9-NEXT: add r3, r12
218
+ ; A9-NEXT: strb.w r3, [r2, lr]
219
+ ; A9-NEXT: add.w r3, r0, lr
220
+ ; A9-NEXT: ldrb.w r12, [r3, #1]
221
+ ; A9-NEXT: ldrb r5, [r4, #1]
222
+ ; A9-NEXT: add r12, r5
223
+ ; A9-NEXT: add.w r5, r2, lr
224
+ ; A9-NEXT: strb.w r12, [r5, #1]
225
+ ; A9-NEXT: add.w lr, lr, #4
226
+ ; A9-NEXT: cmp.w lr, #400
227
+ ; A9-NEXT: ldrb.w r12, [r3, #2]
228
+ ; A9-NEXT: ldrb r6, [r4, #2]
229
+ ; A9-NEXT: add r6, r12
230
+ ; A9-NEXT: strb r6, [r5, #2]
231
+ ; A9-NEXT: ldrb r3, [r3, #3]
232
+ ; A9-NEXT: ldrb r6, [r4, #3]
233
+ ; A9-NEXT: add r3, r6
234
+ ; A9-NEXT: strb r3, [r5, #3]
235
+ ; A9-NEXT: bne .LBB3_1
236
+ ; A9-NEXT: @ %bb.2: @ %for.end
237
+ ; A9-NEXT: pop {r4, r5, r6, pc}
143
238
entry:
144
239
br label %for.body
145
240
@@ -200,14 +295,45 @@ for.end: ; preds = %for.body
200
295
;
201
296
; Loads and stores should use post-increment addressing, with no add or add.w instructions.
202
297
; Most importantly, there should be no spills or reloads!
203
- ;
204
- ; A9: testNeon:
205
- ; A9: %.lr.ph
206
- ; A9-NOT: lsl.w
207
- ; A9-NOT: {{ldr|str|adds|add r}}
208
- ; A9-NOT: add.w r
209
- ; A9: bne
210
298
define hidden void @testNeon (i8* %ref_data , i32 %ref_stride , i32 %limit , <16 x i8 >* nocapture %data ) nounwind optsize {
299
+ ; A9-LABEL: testNeon:
300
+ ; A9: @ %bb.0:
301
+ ; A9-NEXT: .save {r4, r5, r7, lr}
302
+ ; A9-NEXT: push {r4, r5, r7, lr}
303
+ ; A9-NEXT: vmov.i32 q8, #0x0
304
+ ; A9-NEXT: cmp r2, #1
305
+ ; A9-NEXT: blt .LBB4_4
306
+ ; A9-NEXT: @ %bb.1: @ %.lr.ph
307
+ ; A9-NEXT: movs r5, #0
308
+ ; A9-NEXT: movw r4, #64464
309
+ ; A9-NEXT: sub.w r12, r5, r2, lsl #6
310
+ ; A9-NEXT: sub.w lr, r1, r1, lsl #4
311
+ ; A9-NEXT: movt r4, #65535
312
+ ; A9-NEXT: mov r5, r3
313
+ ; A9-NEXT: .LBB4_2: @ =>This Inner Loop Header: Depth=1
314
+ ; A9-NEXT: vld1.64 {d18}, [r0], r1
315
+ ; A9-NEXT: subs r2, #1
316
+ ; A9-NEXT: vld1.64 {d19}, [r0], r1
317
+ ; A9-NEXT: vst1.8 {d18, d19}, [r5]!
318
+ ; A9-NEXT: vld1.64 {d20}, [r0], r1
319
+ ; A9-NEXT: vld1.64 {d21}, [r0], r1
320
+ ; A9-NEXT: vst1.8 {d20, d21}, [r5]!
321
+ ; A9-NEXT: vld1.64 {d22}, [r0], r1
322
+ ; A9-NEXT: vadd.i8 q9, q9, q10
323
+ ; A9-NEXT: vld1.64 {d23}, [r0], r1
324
+ ; A9-NEXT: vst1.8 {d22, d23}, [r5]!
325
+ ; A9-NEXT: vld1.64 {d20}, [r0], r1
326
+ ; A9-NEXT: vadd.i8 q9, q9, q11
327
+ ; A9-NEXT: vld1.64 {d21}, [r0], lr
328
+ ; A9-NEXT: vadd.i8 q9, q9, q10
329
+ ; A9-NEXT: vadd.i8 q8, q8, q9
330
+ ; A9-NEXT: vst1.8 {d20, d21}, [r5], r4
331
+ ; A9-NEXT: bne .LBB4_2
332
+ ; A9-NEXT: @ %bb.3: @ %._crit_edge
333
+ ; A9-NEXT: add.w r3, r3, r12, lsl #4
334
+ ; A9-NEXT: .LBB4_4:
335
+ ; A9-NEXT: vst1.32 {d16, d17}, [r3]
336
+ ; A9-NEXT: pop {r4, r5, r7, pc}
211
337
%1 = icmp sgt i32 %limit , 0
212
338
br i1 %1 , label %.lr.ph , label %45
213
339
@@ -284,24 +410,41 @@ declare <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8*, i32) nounwind readonly
284
410
; Handle chains in which the same offset is used for both loads and
285
411
; stores to the same array.
286
412
; rdar://11410078.
287
- ;
288
- ; A9: @testReuse
289
- ; A9: %for.body
290
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE:[r[0-9]+]]], [[INC:r[0-9]]]
291
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
292
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
293
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
294
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
295
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
296
- ; A9: vld1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
297
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
298
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
299
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
300
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
301
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]], [[INC]]
302
- ; A9: vst1.8 {d{{[0-9]+}}}, [[BASE]]
303
- ; A9: bne
304
413
define void @testReuse (i8* %src , i32 %stride ) nounwind ssp {
414
+ ; A9-LABEL: testReuse:
415
+ ; A9: @ %bb.0: @ %entry
416
+ ; A9-NEXT: sub.w r12, r0, r1, lsl #2
417
+ ; A9-NEXT: sub.w r0, r1, r1, lsl #2
418
+ ; A9-NEXT: lsls r2, r0, #1
419
+ ; A9-NEXT: movs r3, #0
420
+ ; A9-NEXT: .LBB5_1: @ %for.body
421
+ ; A9-NEXT: @ =>This Inner Loop Header: Depth=1
422
+ ; A9-NEXT: add.w r0, r12, r3
423
+ ; A9-NEXT: adds r3, #8
424
+ ; A9-NEXT: vld1.8 {d16}, [r0], r1
425
+ ; A9-NEXT: cmp r3, #32
426
+ ; A9-NEXT: vld1.8 {d17}, [r0], r1
427
+ ; A9-NEXT: vhadd.u8 d16, d16, d17
428
+ ; A9-NEXT: vld1.8 {d18}, [r0], r1
429
+ ; A9-NEXT: vhadd.u8 d17, d17, d18
430
+ ; A9-NEXT: vld1.8 {d19}, [r0], r1
431
+ ; A9-NEXT: vhadd.u8 d18, d18, d19
432
+ ; A9-NEXT: vld1.8 {d20}, [r0], r1
433
+ ; A9-NEXT: vhadd.u8 d19, d19, d20
434
+ ; A9-NEXT: vld1.8 {d21}, [r0], r1
435
+ ; A9-NEXT: vhadd.u8 d20, d20, d21
436
+ ; A9-NEXT: vld1.8 {d22}, [r0], r1
437
+ ; A9-NEXT: vhadd.u8 d21, d21, d22
438
+ ; A9-NEXT: vld1.8 {d23}, [r0], r2
439
+ ; A9-NEXT: vst1.8 {d16}, [r0], r1
440
+ ; A9-NEXT: vst1.8 {d17}, [r0], r1
441
+ ; A9-NEXT: vst1.8 {d18}, [r0], r1
442
+ ; A9-NEXT: vst1.8 {d19}, [r0], r1
443
+ ; A9-NEXT: vst1.8 {d20}, [r0], r1
444
+ ; A9-NEXT: vst1.8 {d21}, [r0]
445
+ ; A9-NEXT: bne .LBB5_1
446
+ ; A9-NEXT: @ %bb.2: @ %for.end
447
+ ; A9-NEXT: bx lr
305
448
entry:
306
449
%mul = shl nsw i32 %stride , 2
307
450
%idx.neg = sub i32 0 , %mul
0 commit comments