Skip to content
This repository was archived by the owner on Nov 8, 2023. It is now read-only.

Commit 3975431

Browse files
enh-google authored and Gerrit Code Review committed
Merge "Add avx2 version of wmemset in binoic"
2 parents 7e958d0 + 4ed2f47 commit 3975431

File tree

6 files changed

+319
-0
lines changed

6 files changed

+319
-0
lines changed

libc/Android.bp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ cc_library_static {
273273
"upstream-freebsd/lib/libc/string/wcscat.c",
274274
"upstream-freebsd/lib/libc/string/wcscpy.c",
275275
"upstream-freebsd/lib/libc/string/wmemcmp.c",
276+
"upstream-freebsd/lib/libc/string/wmemset.c",
276277
],
277278
},
278279
},
@@ -902,6 +903,7 @@ cc_library_static {
902903
"arch-x86/generic/string/wcscat.c",
903904
"arch-x86/generic/string/wcscpy.c",
904905
"arch-x86/generic/string/wmemcmp.c",
906+
"arch-x86/generic/string/wmemset.c",
905907

906908
"arch-x86/atom/string/sse2-memchr-atom.S",
907909
"arch-x86/atom/string/sse2-memrchr-atom.S",
@@ -950,6 +952,9 @@ cc_library_static {
950952
"arch-x86/atom/string/ssse3-strcpy-atom.S",
951953
"arch-x86/atom/string/ssse3-strncpy-atom.S",
952954
"arch-x86/atom/string/ssse3-wmemcmp-atom.S",
955+
956+
// avx2 functions
957+
"arch-x86/kabylake/string/avx2-wmemset-kbl.S",
953958
],
954959

955960
exclude_srcs: [
@@ -972,6 +977,7 @@ cc_library_static {
972977
"arch-x86_64/string/sse4-memcmp-slm.S",
973978
"arch-x86_64/string/ssse3-strcmp-slm.S",
974979
"arch-x86_64/string/ssse3-strncmp-slm.S",
980+
"arch-x86_64/string/avx2-wmemset-kbl.S",
975981

976982
"arch-x86_64/bionic/__bionic_clone.S",
977983
"arch-x86_64/bionic/_exit_with_stack_teardown.S",

libc/arch-x86/dynamic_function_dispatch.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,13 @@ DEFINE_IFUNC_FOR(wmemcmp) {
107107
RETURN_FUNC(wmemcmp_func, wmemcmp_freebsd);
108108
}
109109

110+
typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n);
111+
DEFINE_IFUNC_FOR(wmemset) {
112+
__builtin_cpu_init();
113+
if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2);
114+
RETURN_FUNC(wmemset_func, wmemset_freebsd);
115+
}
116+
110117
typedef int strcmp_func(const char* __lhs, const char* __rhs);
111118
DEFINE_IFUNC_FOR(strcmp) {
112119
__builtin_cpu_init();
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
// Copyright (C) 2019 The Android Open Source Project
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Build the upstream-FreeBSD wmemset under the name wmemset_freebsd so it can
// coexist with the AVX2 variant and serve as the dispatch fallback on CPUs
// without AVX2 (see dynamic_function_dispatch.cpp / static_function_dispatch.S).
#define wmemset wmemset_freebsd

#include <upstream-freebsd/lib/libc/string/wmemset.c>
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
/*
Copyright (C) 2019 The Android Open Source Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/

#include <private/bionic_asm.h>

#ifndef WMEMSET
#define WMEMSET wmemset_avx2
#endif

/*
 * wchar_t* wmemset(wchar_t* dst, wchar_t wc, size_t n) -- AVX2, IA-32 (cdecl).
 *
 * Stack args, as seen after the five 4-byte pushes below (esp lowered by 20):
 *   24(%esp) = dst, 28(%esp) = wc, 32(%esp) = n (count of 4-byte wchar_t).
 * Returns dst in %eax.  "pushl %eax" creates one 4-byte spill slot at (%esp).
 *
 * Strategy (a "chunk" is 32 elements = 128 bytes = four ymm stores):
 *   n == 0             -> return immediately
 *   n <  32            -> scalar store loop only (.LBB0_11)
 *   n >= 32            -> broadcast wc into %ymm0, then:
 *     >= 8 chunks      -> 8x-unrolled main loop (.LBB0_4), 1024 bytes/iter
 *     leftover chunks  -> single-chunk loop (.LBB0_7)
 *     n % 32 elements  -> scalar tail (.LBB0_11)
 */
ENTRY(WMEMSET)
# BB#0:
        pushl   %ebp
        pushl   %ebx
        pushl   %edi
        pushl   %esi
        pushl   %eax                    # reserve the 4-byte spill slot
        movl    32(%esp), %ecx          # ecx = n
        movl    24(%esp), %eax          # eax = dst (also the return value)
        testl   %ecx, %ecx
        je      .LBB0_12                # n == 0: nothing to store
# BB#1:
        movl    28(%esp), %edx          # edx = wc
        xorl    %edi, %edi              # edi = elements handled by vector code
        movl    %eax, %esi              # esi = write pointer
        cmpl    $32, %ecx
        jb      .LBB0_10                # less than one chunk: scalar only
# BB#2:
        movl    %ecx, %eax
        andl    $-32, %eax              # eax = n rounded down to multiple of 32
        vmovd   %edx, %xmm0
        vpbroadcastd    %xmm0, %ymm0    # ymm0 = wc replicated into all 8 lanes
        movl    %eax, (%esp)            # 4-byte Spill: vectorized element count
        leal    -32(%eax), %esi
        movl    %esi, %eax
        shrl    $5, %eax                # eax = chunk count - 1
        leal    1(%eax), %edi
        andl    $7, %edi                # edi = chunk count % 8 (for .LBB0_7)
        xorl    %ebx, %ebx              # ebx = element index
        cmpl    $224, %esi
        jb      .LBB0_5                 # fewer than 8 chunks: skip unrolled loop
# BB#3:
        movl    24(%esp), %esi
        leal    992(%esi), %ebp         # ebp = dst + 992 bytes (keeps disps small)
        leal    -1(%edi), %esi
        subl    %eax, %esi              # esi = -(8 * unrolled-loop iterations)
        xorl    %ebx, %ebx
        .p2align        4, 0x90
.LBB0_4:                                # 8 chunks (32 stores, 1024 bytes) per iter
        vmovdqu %ymm0, -992(%ebp,%ebx,4)
        vmovdqu %ymm0, -960(%ebp,%ebx,4)
        vmovdqu %ymm0, -928(%ebp,%ebx,4)
        vmovdqu %ymm0, -896(%ebp,%ebx,4)
        vmovdqu %ymm0, -864(%ebp,%ebx,4)
        vmovdqu %ymm0, -832(%ebp,%ebx,4)
        vmovdqu %ymm0, -800(%ebp,%ebx,4)
        vmovdqu %ymm0, -768(%ebp,%ebx,4)
        vmovdqu %ymm0, -736(%ebp,%ebx,4)
        vmovdqu %ymm0, -704(%ebp,%ebx,4)
        vmovdqu %ymm0, -672(%ebp,%ebx,4)
        vmovdqu %ymm0, -640(%ebp,%ebx,4)
        vmovdqu %ymm0, -608(%ebp,%ebx,4)
        vmovdqu %ymm0, -576(%ebp,%ebx,4)
        vmovdqu %ymm0, -544(%ebp,%ebx,4)
        vmovdqu %ymm0, -512(%ebp,%ebx,4)
        vmovdqu %ymm0, -480(%ebp,%ebx,4)
        vmovdqu %ymm0, -448(%ebp,%ebx,4)
        vmovdqu %ymm0, -416(%ebp,%ebx,4)
        vmovdqu %ymm0, -384(%ebp,%ebx,4)
        vmovdqu %ymm0, -352(%ebp,%ebx,4)
        vmovdqu %ymm0, -320(%ebp,%ebx,4)
        vmovdqu %ymm0, -288(%ebp,%ebx,4)
        vmovdqu %ymm0, -256(%ebp,%ebx,4)
        vmovdqu %ymm0, -224(%ebp,%ebx,4)
        vmovdqu %ymm0, -192(%ebp,%ebx,4)
        vmovdqu %ymm0, -160(%ebp,%ebx,4)
        vmovdqu %ymm0, -128(%ebp,%ebx,4)
        vmovdqu %ymm0, -96(%ebp,%ebx,4)
        vmovdqu %ymm0, -64(%ebp,%ebx,4)
        vmovdqu %ymm0, -32(%ebp,%ebx,4)
        vmovdqu %ymm0, (%ebp,%ebx,4)
        addl    $256, %ebx              # imm = 0x100; advance 256 elements
        addl    $8, %esi
        jne     .LBB0_4
.LBB0_5:
        testl   %edi, %edi
        movl    24(%esp), %eax          # restore eax = dst for the return value
        je      .LBB0_8                 # no leftover chunks
# BB#6:
        leal    (%eax,%ebx,4), %esi
        addl    $96, %esi               # esi = next write position + 96 bytes
        negl    %edi                    # edi = -(leftover chunk count)
        .p2align        4, 0x90
.LBB0_7:                                # one 32-element (128-byte) chunk per iter
        vmovdqu %ymm0, -96(%esi)
        vmovdqu %ymm0, -64(%esi)
        vmovdqu %ymm0, -32(%esi)
        vmovdqu %ymm0, (%esi)
        subl    $-128, %esi             # advance 128 bytes (sub -128 encodes shorter)
        addl    $1, %edi
        jne     .LBB0_7
.LBB0_8:
        movl    (%esp), %edi            # 4-byte Reload: vectorized element count
        cmpl    %ecx, %edi
        je      .LBB0_12                # n was a multiple of 32: done
# BB#9:
        leal    (%eax,%edi,4), %esi     # esi = start of the scalar tail
.LBB0_10:
        subl    %edi, %ecx              # ecx = elements left for the scalar loop
        .p2align        4, 0x90
.LBB0_11:                               # scalar tail: one wchar_t per iteration
        movl    %edx, (%esi)
        addl    $4, %esi
        addl    $-1, %ecx
        jne     .LBB0_11
.LBB0_12:
        addl    $4, %esp                # drop the spill slot
        popl    %esi
        popl    %edi
        popl    %ebx
        popl    %ebp
        vzeroupper                      # avoid AVX/SSE transition penalties in callers
        retl                            # return dst in %eax
END(WMEMSET)

libc/arch-x86/static_function_dispatch.S

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ FUNCTION_DELEGATE(strcmp, strcmp_generic)
4545
FUNCTION_DELEGATE(strncmp, strncmp_generic)
4646
FUNCTION_DELEGATE(strcat, strcat_generic)
4747
FUNCTION_DELEGATE(wmemcmp, wmemcmp_freebsd)
48+
FUNCTION_DELEGATE(wmemset, wmemset_freebsd)
4849
FUNCTION_DELEGATE(wcscat, wcscat_freebsd)
4950
FUNCTION_DELEGATE(strncat, strncat_openbsd)
5051
FUNCTION_DELEGATE(strlcat, strlcat_openbsd)
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
/*
Copyright (C) 2019 The Android Open Source Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
 * Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the
   distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
SUCH DAMAGE.
*/

#include <private/bionic_asm.h>

#ifndef WMEMSET
#define WMEMSET wmemset_avx2
#endif

        # Dedicated section for the AVX2 variant.
        .section .text.avx2,"ax",@progbits

/*
 * wchar_t* wmemset(wchar_t* dst, wchar_t wc, size_t n) -- AVX2, x86-64 SysV.
 *
 * In:  rdi = dst, esi = wc (wchar_t is 32-bit here), rdx = n (element count).
 * Out: rax = dst.  Leaf function; no stack frame is needed.
 *
 * Strategy (a "chunk" is 32 elements = 128 bytes = four ymm stores):
 *   n == 0             -> return dst immediately
 *   n <  32            -> scalar store loop only (.LBB0_13)
 *   n >= 32            -> broadcast wc into %ymm0, then:
 *     >= 8 chunks      -> 8x-unrolled main loop (.LBB0_6), 1024 bytes/iter
 *     leftover chunks  -> single-chunk loop (.LBB0_9)
 *     n % 32 elements  -> scalar tail (.LBB0_13)
 */
ENTRY (WMEMSET)
# BB#0:
        testq   %rdx, %rdx
        je      .LBB0_14                # n == 0: just return dst
# BB#1:
        cmpq    $32, %rdx
        jae     .LBB0_3
# BB#2:
        xorl    %r8d, %r8d              # r8 = vectorized element count (none)
        movq    %rdi, %rax              # rax = write pointer for the scalar loop
        jmp     .LBB0_12
.LBB0_3:
        movq    %rdx, %r8
        andq    $-32, %r8               # r8 = n rounded down to multiple of 32
        vmovd   %esi, %xmm0
        vpbroadcastd    %xmm0, %ymm0    # ymm0 = wc replicated into all 8 lanes
        leaq    -32(%r8), %rcx
        movq    %rcx, %rax
        shrq    $5, %rax                # rax = chunk count - 1
        leal    1(%rax), %r9d
        andl    $7, %r9d                # r9 = chunk count % 8 (for .LBB0_9)
        cmpq    $224, %rcx
        jae     .LBB0_5                 # at least 8 chunks: run unrolled loop
# BB#4:
        xorl    %eax, %eax              # rax = element index
        testq   %r9, %r9
        jne     .LBB0_8
        jmp     .LBB0_10
.LBB0_5:
        leaq    992(%rdi), %rcx         # rcx = dst + 992 bytes (keeps disps small)
        leaq    -1(%r9), %r10
        subq    %rax, %r10              # r10 = -(8 * unrolled-loop iterations)
        xorl    %eax, %eax
        .p2align        4, 0x90
.LBB0_6:                                # 8 chunks (32 stores, 1024 bytes) per iter
        vmovdqu %ymm0, -992(%rcx,%rax,4)
        vmovdqu %ymm0, -960(%rcx,%rax,4)
        vmovdqu %ymm0, -928(%rcx,%rax,4)
        vmovdqu %ymm0, -896(%rcx,%rax,4)
        vmovdqu %ymm0, -864(%rcx,%rax,4)
        vmovdqu %ymm0, -832(%rcx,%rax,4)
        vmovdqu %ymm0, -800(%rcx,%rax,4)
        vmovdqu %ymm0, -768(%rcx,%rax,4)
        vmovdqu %ymm0, -736(%rcx,%rax,4)
        vmovdqu %ymm0, -704(%rcx,%rax,4)
        vmovdqu %ymm0, -672(%rcx,%rax,4)
        vmovdqu %ymm0, -640(%rcx,%rax,4)
        vmovdqu %ymm0, -608(%rcx,%rax,4)
        vmovdqu %ymm0, -576(%rcx,%rax,4)
        vmovdqu %ymm0, -544(%rcx,%rax,4)
        vmovdqu %ymm0, -512(%rcx,%rax,4)
        vmovdqu %ymm0, -480(%rcx,%rax,4)
        vmovdqu %ymm0, -448(%rcx,%rax,4)
        vmovdqu %ymm0, -416(%rcx,%rax,4)
        vmovdqu %ymm0, -384(%rcx,%rax,4)
        vmovdqu %ymm0, -352(%rcx,%rax,4)
        vmovdqu %ymm0, -320(%rcx,%rax,4)
        vmovdqu %ymm0, -288(%rcx,%rax,4)
        vmovdqu %ymm0, -256(%rcx,%rax,4)
        vmovdqu %ymm0, -224(%rcx,%rax,4)
        vmovdqu %ymm0, -192(%rcx,%rax,4)
        vmovdqu %ymm0, -160(%rcx,%rax,4)
        vmovdqu %ymm0, -128(%rcx,%rax,4)
        vmovdqu %ymm0, -96(%rcx,%rax,4)
        vmovdqu %ymm0, -64(%rcx,%rax,4)
        vmovdqu %ymm0, -32(%rcx,%rax,4)
        vmovdqu %ymm0, (%rcx,%rax,4)
        addq    $256, %rax              # imm = 0x100; advance 256 elements
        addq    $8, %r10
        jne     .LBB0_6
# BB#7:
        testq   %r9, %r9
        je      .LBB0_10                # no leftover chunks
.LBB0_8:
        leaq    (%rdi,%rax,4), %rax
        addq    $96, %rax               # rax = next write position + 96 bytes
        negq    %r9                     # r9 = -(leftover chunk count)
        .p2align        4, 0x90
.LBB0_9:                                # one 32-element (128-byte) chunk per iter
        vmovdqu %ymm0, -96(%rax)
        vmovdqu %ymm0, -64(%rax)
        vmovdqu %ymm0, -32(%rax)
        vmovdqu %ymm0, (%rax)
        subq    $-128, %rax             # advance 128 bytes (sub -128 encodes shorter)
        addq    $1, %r9
        jne     .LBB0_9
.LBB0_10:
        cmpq    %rdx, %r8
        je      .LBB0_14                # n was a multiple of 32: done
# BB#11:
        leaq    (%rdi,%r8,4), %rax      # rax = start of the scalar tail
.LBB0_12:
        subq    %r8, %rdx               # rdx = elements left for the scalar loop
        .p2align        4, 0x90
.LBB0_13:                               # scalar tail: one wchar_t per iteration
        movl    %esi, (%rax)
        addq    $4, %rax
        addq    $-1, %rdx
        jne     .LBB0_13
.LBB0_14:
        movq    %rdi, %rax              # return dst
        vzeroupper                      # required before returning to SSE/C code
        retq
END(WMEMSET)

0 commit comments

Comments
 (0)