Skip to content

Commit b369b72

Browse files
sophie-zhao authored and abner-chenc committed
crypto/internal/poly1305: implement function update in assembly on loong64
The performance improvements on Loongson-3A5000 and Loongson-3A6000 are as follows:

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A5000 @ 2500.00MHz
                 |  bench.old   |              bench.new               |
                 |    sec/op    |    sec/op     vs base                |
64                 122.8n ± 0%    100.0n ± 0%   -18.57% (p=0.000 n=10)
1K                 1152.0n ± 0%   732.2n ± 0%   -36.44% (p=0.000 n=10)
2M                 2.356m ± 0%    1.443m ± 0%   -38.74% (p=0.000 n=10)
64Unaligned        122.7n ± 0%    101.5n ± 0%   -17.24% (p=0.000 n=10)
1KUnaligned        1152.0n ± 0%   745.4n ± 0%   -35.30% (p=0.000 n=10)
2MUnaligned        2.336m ± 0%    1.473m ± 0%   -36.94% (p=0.000 n=10)
Write64            77.92n ± 0%    54.88n ± 0%   -29.57% (p=0.000 n=10)
Write1K            1106.0n ± 0%   683.3n ± 0%   -38.22% (p=0.000 n=10)
Write2M            2.356m ± 0%    1.444m ± 0%   -38.72% (p=0.000 n=10)
Write64Unaligned   77.87n ± 0%    55.69n ± 0%   -28.49% (p=0.000 n=10)
Write1KUnaligned   1106.0n ± 0%   708.1n ± 0%   -35.97% (p=0.000 n=10)
Write2MUnaligned   2.335m ± 0%    1.471m ± 0%   -37.01% (p=0.000 n=10)
geomean            6.373µ         4.272µ        -32.96%

                 |  bench.old   |              bench.new               |
                 |     B/s      |     B/s       vs base                |
64                 497.1Mi ± 0%   610.3Mi ± 0%  +22.78% (p=0.000 n=10)
1K                 847.6Mi ± 0%   1333.7Mi ± 0% +57.35% (p=0.000 n=10)
2M                 849.0Mi ± 0%   1385.9Mi ± 0% +63.24% (p=0.000 n=10)
64Unaligned        497.4Mi ± 0%   600.9Mi ± 0%  +20.81% (p=0.000 n=10)
1KUnaligned        847.6Mi ± 0%   1310.1Mi ± 0% +54.57% (p=0.000 n=10)
2MUnaligned        856.3Mi ± 0%   1357.9Mi ± 0% +58.58% (p=0.000 n=10)
Write64            783.3Mi ± 0%   1112.2Mi ± 0% +41.99% (p=0.000 n=10)
Write1K            882.8Mi ± 0%   1429.1Mi ± 0% +61.88% (p=0.000 n=10)
Write2M            849.0Mi ± 0%   1385.4Mi ± 0% +63.18% (p=0.000 n=10)
Write64Unaligned   783.8Mi ± 0%   1096.1Mi ± 0% +39.85% (p=0.000 n=10)
Write1KUnaligned   882.8Mi ± 0%   1379.0Mi ± 0% +56.20% (p=0.000 n=10)
Write2MUnaligned   856.5Mi ± 0%   1359.9Mi ± 0% +58.76% (p=0.000 n=10)
geomean            772.2Mi        1.125Gi       +49.18%

goos: linux
goarch: loong64
pkg: golang.org/x/crypto/internal/poly1305
cpu: Loongson-3A6000-HV @ 2500.00MHz
                 |  bench.old   |              bench.new               |
                 |    sec/op    |    sec/op     vs base                |
64                 92.06n ± 0%    71.55n ± 0%   -22.28% (p=0.000 n=10)
1K                 998.4n ± 0%    607.7n ± 0%   -39.13% (p=0.000 n=10)
2M                 1.976m ± 0%    1.165m ± 0%   -41.07% (p=0.000 n=10)
64Unaligned        92.05n ± 0%    71.55n ± 0%   -22.27% (p=0.000 n=10)
1KUnaligned        998.3n ± 0%    607.6n ± 0%   -39.13% (p=0.000 n=10)
2MUnaligned        1.975m ± 0%    1.222m ± 0%   -38.11% (p=0.000 n=10)
Write64            65.24n ± 0%    45.23n ± 0%   -30.67% (p=0.000 n=10)
Write1K            970.8n ± 0%    577.6n ± 0%   -40.51% (p=0.000 n=10)
Write2M            1.965m ± 0%    1.163m ± 0%   -40.81% (p=0.000 n=10)
Write64Unaligned   65.24n ± 0%    45.24n ± 0%   -30.66% (p=0.000 n=10)
Write1KUnaligned   970.8n ± 0%    577.6n ± 0%   -40.50% (p=0.000 n=10)
Write2MUnaligned   1.965m ± 0%    1.222m ± 0%   -37.81% (p=0.000 n=10)
geomean            5.317µ         3.426µ        -35.58%

                 |  bench.old   |              bench.new               |
                 |     B/s      |     B/s       vs base                |
64                 663.0Mi ± 0%   853.1Mi ± 0%  +28.67% (p=0.000 n=10)
1K                 978.1Mi ± 0%   1606.9Mi ± 0% +64.28% (p=0.000 n=10)
2M                 1012.0Mi ± 0%  1717.4Mi ± 0% +69.70% (p=0.000 n=10)
64Unaligned        663.1Mi ± 0%   853.1Mi ± 0%  +28.65% (p=0.000 n=10)
1KUnaligned        978.2Mi ± 0%   1607.1Mi ± 0% +64.29% (p=0.000 n=10)
2MUnaligned        1012.6Mi ± 0%  1636.2Mi ± 0% +61.58% (p=0.000 n=10)
Write64            935.5Mi ± 0%   1349.3Mi ± 0% +44.23% (p=0.000 n=10)
Write1K            1005.9Mi ± 0%  1690.9Mi ± 0% +68.09% (p=0.000 n=10)
Write2M            1017.7Mi ± 0%  1719.5Mi ± 0% +68.95% (p=0.000 n=10)
Write64Unaligned   935.6Mi ± 0%   1349.3Mi ± 0% +44.22% (p=0.000 n=10)
Write1KUnaligned   1006.0Mi ± 0%  1690.9Mi ± 0% +68.08% (p=0.000 n=10)
Write2MUnaligned   1017.7Mi ± 0%  1636.4Mi ± 0% +60.80% (p=0.000 n=10)
geomean            925.6Mi        1.403Gi       +55.22%

Change-Id: If05a8bfc868b3e6f903ff169eed7a894af741f9b
Reviewed-on: https://go-review.googlesource.com/c/crypto/+/638455
Reviewed-by: David Chase <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Junyang Shao <[email protected]>
Reviewed-by: abner chenc <[email protected]>
1 parent 6b853fb commit b369b72

File tree

4 files changed

+125
-49
lines changed

4 files changed

+125
-49
lines changed

Diff for: internal/poly1305/mac_noasm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
5+
//go:build (!amd64 && !loong64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
66

77
package poly1305
88

Diff for: internal/poly1305/sum_amd64.go renamed to internal/poly1305/sum_asm.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
// Use of this source code is governed by a BSD-style
33
// license that can be found in the LICENSE file.
44

5-
//go:build gc && !purego
5+
//go:build gc && !purego && (amd64 || loong64 || ppc64 || ppc64le)
66

77
package poly1305
88

Diff for: internal/poly1305/sum_loong64.s

+123
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
// Copyright 2025 The Go Authors. All rights reserved.
2+
// Use of this source code is governed by a BSD-style
3+
// license that can be found in the LICENSE file.
4+
5+
//go:build gc && !purego
6+
7+
// func update(state *macState, msg []byte)
//
// Absorbs msg into the accumulator stored in *state. Each 16-byte chunk of
// msg is added to the three-limb accumulator h, the sum is multiplied by the
// two-limb value r, and the result is folded back into three limbs. A final
// chunk shorter than 16 bytes is assembled byte by byte with an extra 1 byte
// placed just above it, then goes through the same multiply step.
// Only h0..h2 are stored back; r0/r1 are only read.
//
// Register roles:
//   R4  = state pointer          R5  = msg cursor
//   R6  = bytes remaining        R7  = 16 (block size)
//   R8, R9, R10 = h0, h1, h2     R11, R12 = r0, r1
//   R13..R17, R24..R28 = scratch (partial products, sums, carry flags)
//
// Carries are detected with SGTU: after "sum = a + b", "a > sum" (unsigned)
// is 1 exactly when the 64-bit addition wrapped.
//
// NOTE(review): the offsets 0..32 assume macState starts with h [3]uint64
// followed by r [2]uint64 — confirm against the Go declaration.
// NOTE(review): the "& 3" / ">> 2" folding after the multiply looks like the
// usual Poly1305 partial reduction modulo 2^130 - 5 — confirm.
8+
TEXT ·update(SB), $0-32
9+
MOVV state+0(FP), R4
10+
MOVV msg_base+8(FP), R5
11+
MOVV msg_len+16(FP), R6
12+
13+
MOVV $0x10, R7 // block size, compared against the remaining length
14+
15+
MOVV (R4), R8 // h0
16+
MOVV 8(R4), R9 // h1
17+
MOVV 16(R4), R10 // h2
18+
MOVV 24(R4), R11 // r0
19+
MOVV 32(R4), R12 // r1
20+
21+
BLT R6, R7, bytes_between_0_and_15 // fewer than 16 bytes: skip the full-block loop
22+
23+
// loop: add one full 16-byte block (plus 1 in the h2 limb) to h.
loop:
24+
MOVV (R5), R14 // msg[0:8]
25+
MOVV 8(R5), R16 // msg[8:16]
26+
ADDV R14, R8, R8 // h0 (x1 + y1 = z1', if z1' < x1 then z1' overflow)
27+
ADDV R16, R9, R27 // R27 = h1 + msg[8:16]
28+
SGTU R14, R8, R24 // h0.carry
29+
SGTU R9, R27, R28 // carry out of h1 + msg[8:16]
30+
ADDV R27, R24, R9 // h1
31+
SGTU R27, R9, R24 // carry out of adding h0.carry
32+
OR R24, R28, R24 // h1.carry
33+
ADDV $0x01, R24, R24 // full block: add 1 to the h2 limb together with the carry
34+
ADDV R10, R24, R10 // h2
35+
36+
ADDV $16, R5, R5 // msg = msg[16:]
37+
38+
// multiply: h = (h0, h1, h2) * (r0, r1), accumulated into four limbs
// R14 (t0), R15 (t1), R25 (t2), R26 (t3), then folded back into R8..R10.
multiply:
39+
MULV R8, R11, R14 // h0r0.lo
40+
MULHVU R8, R11, R15 // h0r0.hi
41+
MULV R9, R11, R13 // h1r0.lo
42+
MULHVU R9, R11, R16 // h1r0.hi
43+
ADDV R13, R15, R15 // t1 = h0r0.hi + h1r0.lo
44+
SGTU R13, R15, R24 // carry out of t1
45+
ADDV R24, R16, R16 // h1r0.hi + carry
46+
MULV R10, R11, R25 // h2r0 (low 64 bits only)
47+
ADDV R16, R25, R25 // t2 = h2r0 + h1r0.hi + carry
48+
MULV R8, R12, R13 // h0r1.lo
49+
MULHVU R8, R12, R16 // h0r1.hi
50+
ADDV R13, R15, R15 // t1 += h0r1.lo
51+
SGTU R13, R15, R24 // carry out of t1
52+
ADDV R24, R16, R16 // h0r1.hi + carry
53+
MOVV R16, R8 // h0 is no longer needed; reuse R8 to hold h0r1.hi + carry
54+
MULV R10, R12, R26 // h2r1
55+
MULV R9, R12, R13 // h1r1.lo
56+
MULHVU R9, R12, R16 // h1r1.hi
57+
ADDV R13, R25, R25 // t2 += h1r1.lo
58+
ADDV R16, R26, R27 // R27 = h1r1.hi + h2r1
59+
SGTU R13, R25, R24 // carry out of t2
60+
ADDV R27, R24, R26 // t3 = h1r1.hi + h2r1 + carry
61+
ADDV R8, R25, R25 // t2 += h0r1.hi + carry (saved in R8 above)
62+
SGTU R8, R25, R24 // carry out of t2
63+
ADDV R24, R26, R26 // t3 += carry
64+
AND $3, R25, R10 // h2 = low two bits of t2
65+
AND $-4, R25, R17 // R17 = t2 with its low two bits cleared
66+
ADDV R17, R14, R8 // h0 = t0 + (t2 & ~3)
67+
ADDV R26, R15, R27 // R27 = t1 + t3
68+
SGTU R17, R8, R24 // carry out of h0
69+
SGTU R26, R27, R28 // carry out of t1 + t3
70+
ADDV R27, R24, R9 // h1 = t1 + t3 + h0 carry
71+
SGTU R27, R9, R24 // carry out of adding the h0 carry
72+
OR R24, R28, R24 // combined carry out of h1
73+
ADDV R24, R10, R10 // h2 += carry
74+
SLLV $62, R26, R27 // low two bits of t3, moved to the top of the word
75+
SRLV $2, R25, R28 // t2 >> 2
76+
SRLV $2, R26, R26 // t3 >> 2
77+
OR R27, R28, R25 // R25 = low limb of (t3:t2) >> 2
78+
ADDV R25, R8, R8 // h0 += low limb of (t3:t2) >> 2
79+
ADDV R26, R9, R27 // R27 = h1 + (t3 >> 2)
80+
SGTU R25, R8, R24 // carry out of h0
81+
SGTU R26, R27, R28 // carry out of h1 + (t3 >> 2)
82+
ADDV R27, R24, R9 // h1 = h1 + (t3 >> 2) + h0 carry
83+
SGTU R27, R9, R24 // carry out of adding the h0 carry
84+
OR R24, R28, R24 // combined carry out of h1
85+
ADDV R24, R10, R10 // h2 += carry
86+
87+
SUBV $16, R6, R6 // one block consumed
88+
BGE R6, R7, loop // at least 16 bytes left: process another full block
89+
90+
// bytes_between_0_and_15: handle the trailing partial block, if any.
bytes_between_0_and_15:
91+
BEQ R6, R0, done // nothing left: store h and return
92+
MOVV $1, R14 // low word starts as 1; it ends up just above the last message byte
93+
XOR R15, R15 // high word = 0
94+
ADDV R6, R5, R5 // R5 = one past the last remaining byte
95+
96+
// flush_buffer: walk the remaining bytes from last to first, shifting the
// 128-bit value R15:R14 left by 8 and inserting each byte at the bottom.
flush_buffer:
97+
MOVBU -1(R5), R25 // next byte, taken from the end
98+
SRLV $56, R14, R24 // top byte of the low word, about to be shifted out
99+
SLLV $8, R15, R28 // high word << 8
100+
SLLV $8, R14, R14 // low word << 8
101+
OR R24, R28, R15 // high word = (high << 8) | (low >> 56)
102+
XOR R25, R14, R14 // insert the byte into the now-zero low 8 bits
103+
SUBV $1, R6, R6 // one byte fewer to go
104+
SUBV $1, R5, R5 // step the cursor backwards
105+
BNE R6, R0, flush_buffer
106+
107+
ADDV R14, R8, R8 // h0 += low word of the padded block
108+
SGTU R14, R8, R24 // carry out of h0
109+
ADDV R15, R9, R27 // R27 = h1 + high word of the padded block
110+
SGTU R15, R27, R28 // carry out of that addition
111+
ADDV R27, R24, R9 // h1 = h1 + high word + h0 carry
112+
SGTU R27, R9, R24 // carry out of adding the h0 carry
113+
OR R24, R28, R24 // combined carry out of h1
114+
ADDV R10, R24, R10 // h2 += carry (no extra 1 here: it is already inside the padded block)
115+
116+
MOVV $16, R6 // so the SUBV $16 after multiply leaves 0 and control reaches done
117+
JMP multiply
118+
119+
// done: write the updated accumulator back to *state.
done:
120+
MOVV R8, (R4)
121+
MOVV R9, 8(R4)
122+
MOVV R10, 16(R4)
123+
RET

Diff for: internal/poly1305/sum_ppc64x.go

-47
This file was deleted.

0 commit comments

Comments
 (0)