Commit 7b7ac4b

[X86] Expose memory codegen in element insert load tests to improve accuracy of checks
Also replace the X32 check prefixes with X86 for i686 tests (we generally reserve X32 for gnux32 targets)
Parent: a1c892b · Commit: 7b7ac4b
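
The NOTE lines in the diffs below record UTC_ARGS, so future regenerations of these files should keep the unscrubbed memory operands. As a rough sketch of the regeneration step (paths assume a monorepo checkout, with a built llc on PATH or supplied via --llc-binary):

    llvm/utils/update_llc_test_checks.py --no_x86_scrub_mem_shuffle \
        llvm/test/CodeGen/X86/avx.ll llvm/test/CodeGen/X86/sse41.ll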

File tree

2 files changed: 49 additions & 57 deletions

llvm/test/CodeGen/X86/avx.ll

Lines changed: 48 additions & 56 deletions
@@ -1,5 +1,5 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X32
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X86
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64

 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
@@ -43,16 +43,15 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; On X32, account for the argument's move to registers
-; X32-LABEL: insertps_from_vector_load:
-; X32: ## %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT: retl
+; X86-LABEL: insertps_from_vector_load:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64: ## %bb.0:
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT: retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -61,38 +60,34 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap

 ;; Use a non-zero CountS for insertps
 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
-; On X32, account for the argument's move to registers
-;; Try to match a bit more of the instr, since we need the load's offset.
-; X32-LABEL: insertps_from_vector_load_offset:
-; X32: ## %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; X32-NEXT: retl
+; X86-LABEL: insertps_from_vector_load_offset:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vinsertps $32, 4(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64: ## %bb.0:
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X64-NEXT: vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT: retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
   ret <4 x float> %2
 }

 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
-; On X32, account for the argument's move to registers
-;; Try to match a bit more of the instr, since we need the load's offset.
-; X32-LABEL: insertps_from_vector_load_offset_2:
-; X32: ## %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: shll $4, %ecx
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
-; X32-NEXT: retl
+; X86-LABEL: insertps_from_vector_load_offset_2:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: shll $4, %ecx
+; X86-NEXT: vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64: ## %bb.0:
 ; X64-NEXT: shlq $4, %rsi
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT: vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT: retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -101,17 +96,16 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 }

 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_loadf32:
-; X32: ## %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT: retl
+; X86-LABEL: insertps_from_broadcast_loadf32:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadf32:
 ; X64: ## %bb.0:
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT: retq
   %1 = getelementptr inbounds float, float* %fb, i64 %index
   %2 = load float, float* %1, align 4
@@ -124,16 +118,15 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
 }

 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_loadv4f32:
-; X32: ## %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; X32-NEXT: retl
+; X86-LABEL: insertps_from_broadcast_loadv4f32:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
 ; X64: ## %bb.0:
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT: retq
   %1 = load <4 x float>, <4 x float>* %b, align 4
   %2 = extractelement <4 x float> %1, i32 0
@@ -147,20 +140,19 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float

 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
-; On X32, account for the arguments' move to registers
-; X32-LABEL: insertps_from_broadcast_multiple_use:
-; X32: ## %bb.0:
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
-; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
-; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
-; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1
-; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; X32-NEXT: retl
+; X86-LABEL: insertps_from_broadcast_multiple_use:
+; X86: ## %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; X86-NEXT: retl
 ;
 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 ; X64: ## %bb.0:

llvm/test/CodeGen/X86/sse41.ll

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+sse4.1 -show-mc-encoding | FileCheck %s --check-prefixes=SSE,X86-SSE
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX1,X86-AVX1
 ; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl -show-mc-encoding | FileCheck %s --check-prefixes=AVX,AVX512,X86-AVX512
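
To exercise these RUN lines locally, the test files can be fed to lit; a minimal sketch, assuming a configured build directory named build (the path is illustrative):

    build/bin/llvm-lit -v llvm/test/CodeGen/X86/avx.ll llvm/test/CodeGen/X86/sse41.ll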
