- ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
- ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X32
+ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
+ ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64

define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
@@ -43,16 +43,15 @@ define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone

define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
- ; On X32, account for the argument's move to registers
- ; X32-LABEL: insertps_from_vector_load:
- ; X32: ## %bb.0:
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
- ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
- ; X32-NEXT: retl
+ ; X86-LABEL: insertps_from_vector_load:
+ ; X86: ## %bb.0:
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+ ; X86-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load:
; X64: ## %bb.0:
- ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+ ; X64-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -61,38 +60,34 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap

;; Use a non-zero CountS for insertps
define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
- ; On X32, account for the argument's move to registers
- ;; Try to match a bit more of the instr, since we need the load's offset.
- ; X32-LABEL: insertps_from_vector_load_offset:
- ; X32: ## %bb.0:
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
- ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
- ; X32-NEXT: retl
+ ; X86-LABEL: insertps_from_vector_load_offset:
+ ; X86: ## %bb.0:
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-NEXT: vinsertps $32, 4(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
+ ; X86-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset:
; X64: ## %bb.0:
- ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+ ; X64-NEXT: vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %pb, align 16
%2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
ret <4 x float> %2
}

define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
- ; On X32, account for the argument's move to registers
- ;; Try to match a bit more of the instr, since we need the load's offset.
- ; X32-LABEL: insertps_from_vector_load_offset_2:
- ; X32: ## %bb.0:
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X32-NEXT: shll $4, %ecx
- ; X32-NEXT: vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
- ; X32-NEXT: retl
+ ; X86-LABEL: insertps_from_vector_load_offset_2:
+ ; X86: ## %bb.0:
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X86-NEXT: shll $4, %ecx
+ ; X86-NEXT: vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
+ ; X86-NEXT: retl
;
; X64-LABEL: insertps_from_vector_load_offset_2:
; X64: ## %bb.0:
; X64-NEXT: shlq $4, %rsi
- ; X64-NEXT: vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+ ; X64-NEXT: vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
; X64-NEXT: retq
%1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
%2 = load <4 x float>, <4 x float>* %1, align 16
@@ -101,17 +96,16 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
}

define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
- ; On X32, account for the arguments' move to registers
- ; X32-LABEL: insertps_from_broadcast_loadf32:
- ; X32: ## %bb.0:
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
- ; X32-NEXT: retl
+ ; X86-LABEL: insertps_from_broadcast_loadf32:
+ ; X86: ## %bb.0:
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X86-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+ ; X86-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadf32:
; X64: ## %bb.0:
- ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+ ; X64-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
@@ -124,16 +118,15 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
}

define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
- ; On X32, account for the arguments' move to registers
- ; X32-LABEL: insertps_from_broadcast_loadv4f32:
- ; X32: ## %bb.0:
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
- ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
- ; X32-NEXT: retl
+ ; X86-LABEL: insertps_from_broadcast_loadv4f32:
+ ; X86: ## %bb.0:
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
+ ; X86-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_loadv4f32:
; X64: ## %bb.0:
- ; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+ ; X64-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
; X64-NEXT: retq
%1 = load <4 x float>, <4 x float>* %b, align 4
%2 = extractelement <4 x float> %1, i32 0
@@ -147,20 +140,19 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float

;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
- ; On X32, account for the arguments' move to registers
- ; X32-LABEL: insertps_from_broadcast_multiple_use:
- ; X32: ## %bb.0:
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
- ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
- ; X32-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
- ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
- ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
- ; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
- ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
- ; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
- ; X32-NEXT: vaddps %xmm2, %xmm1, %xmm1
- ; X32-NEXT: vaddps %xmm1, %xmm0, %xmm0
- ; X32-NEXT: retl
+ ; X86-LABEL: insertps_from_broadcast_multiple_use:
+ ; X86: ## %bb.0:
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+ ; X86-NEXT: vbroadcastss (%ecx,%eax,4), %xmm4
+ ; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
+ ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
+ ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
+ ; X86-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
+ ; X86-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
+ ; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1
+ ; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0
+ ; X86-NEXT: retl
;
; X64-LABEL: insertps_from_broadcast_multiple_use:
; X64: ## %bb.0: