|
| 1 | +// Copyright 2018 The Go Authors. All rights reserved. |
| 2 | +// Use of this source code is governed by a BSD-style |
| 3 | +// license that can be found in the LICENSE file. |
| 4 | + |
| 5 | +// +build go1.11 |
| 6 | +// +build !gccgo,!appengine |
| 7 | + |
| 8 | +#include "textflag.h" |
| 9 | + |
| 10 | +#define NUM_ROUNDS 10 |
| 11 | + |
| 12 | +// func xorKeyStreamVX(dst, src []byte, key *[8]uint32, nonce *[3]uint32, counter *uint32); XORs src with ChaCha keystream into dst, 256 bytes per pass of loop (four 64-byte blocks, one per vector lane): NUM_ROUNDS double rounds, add the input words back, transpose with VZIP, XOR with src, store to dst, advance *counter by 4 |
| 13 | +TEXT ·xorKeyStreamVX(SB), NOSPLIT, $0 |
| 14 | + MOVD dst+0(FP), R1 |
| 15 | + MOVD src+24(FP), R2 |
| 16 | + MOVD src_len+32(FP), R3 |
| 17 | + MOVD key+48(FP), R4 |
| 18 | + MOVD nonce+56(FP), R6 |
| 19 | + MOVD counter+64(FP), R7 |
| 20 | + |
| 21 | + MOVD $·constants(SB), R10 |
| 22 | + MOVD $·incRotMatrix(SB), R11 |
| 23 | + |
| 24 | + MOVW (R7), R20 |
| 25 | + |
| 26 | + AND $~255, R3, R13 |
| 27 | + ADD R2, R13, R12 // R12 = src + (src_len with its low 8 bits cleared): end of the whole 256-byte chunks; R13 is then reset to src_len & 255, which nothing below reads |
| 28 | + AND $255, R3, R13 |
| 29 | +loop: |
| 30 | + MOVD $NUM_ROUNDS, R21 |
| 31 | + VLD1 (R11), [V30.S4, V31.S4] |
| 32 | + |
| 33 | + // load constants: broadcast each of the four ·constants words across all lanes of V0..V3 |
| 34 | + // VLD4R (R10), [V0.S4, V1.S4, V2.S4, V3.S4], emitted as the raw instruction word below |
| 35 | + WORD $0x4D60E940 |
| 36 | + |
| 37 | + // load keys: broadcast the eight key words into V4..V11, four words per load |
| 38 | + // VLD4R 16(R4), [V4.S4, V5.S4, V6.S4, V7.S4] (post-indexed encoding: R4 advances by 16) |
| 39 | + WORD $0x4DFFE884 |
| 40 | + // VLD4R 16(R4), [V8.S4, V9.S4, V10.S4, V11.S4] (R4 advances by another 16; the SUB $32 that follows undoes both) |
| 41 | + WORD $0x4DFFE888 |
| 42 | + SUB $32, R4 |
| 43 | + |
| 44 | + // load counter + nonce: V12 = the counter word and V13..V15 = the three nonce words, each broadcast to all lanes |
| 45 | + // VLD1R (R7), [V12.S4] |
| 46 | + WORD $0x4D40C8EC |
| 47 | + |
| 48 | + // VLD3R (R6), [V13.S4, V14.S4, V15.S4] |
| 49 | + WORD $0x4D40E8CD |
| 50 | + |
| 51 | + // update counter: add V30 (first 16 bytes of ·incRotMatrix, the words 0,1,2,3) so each lane works on its own block counter |
| 52 | + VADD V30.S4, V12.S4, V12.S4 |
| 53 | + |
| 54 | +chacha: |
| 55 | + // column round: V0..V3 += V4..V7 |
| 56 | + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 16); the 16-bit rotate is VREV32 on halfwords |
| 57 | + VADD V0.S4, V4.S4, V0.S4 |
| 58 | + VADD V1.S4, V5.S4, V1.S4 |
| 59 | + VADD V2.S4, V6.S4, V2.S4 |
| 60 | + VADD V3.S4, V7.S4, V3.S4 |
| 61 | + VEOR V12.B16, V0.B16, V12.B16 |
| 62 | + VEOR V13.B16, V1.B16, V13.B16 |
| 63 | + VEOR V14.B16, V2.B16, V14.B16 |
| 64 | + VEOR V15.B16, V3.B16, V15.B16 |
| 65 | + VREV32 V12.H8, V12.H8 |
| 66 | + VREV32 V13.H8, V13.H8 |
| 67 | + VREV32 V14.H8, V14.H8 |
| 68 | + VREV32 V15.H8, V15.H8 |
| 69 | + // V8..V11 += V12..V15 |
| 70 | + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 12); XOR goes to temporaries V16..V19, rotate is VSHL $12 then VSRI $20 |
| 71 | + VADD V8.S4, V12.S4, V8.S4 |
| 72 | + VADD V9.S4, V13.S4, V9.S4 |
| 73 | + VADD V10.S4, V14.S4, V10.S4 |
| 74 | + VADD V11.S4, V15.S4, V11.S4 |
| 75 | + VEOR V8.B16, V4.B16, V16.B16 |
| 76 | + VEOR V9.B16, V5.B16, V17.B16 |
| 77 | + VEOR V10.B16, V6.B16, V18.B16 |
| 78 | + VEOR V11.B16, V7.B16, V19.B16 |
| 79 | + VSHL $12, V16.S4, V4.S4 |
| 80 | + VSHL $12, V17.S4, V5.S4 |
| 81 | + VSHL $12, V18.S4, V6.S4 |
| 82 | + VSHL $12, V19.S4, V7.S4 |
| 83 | + VSRI $20, V16.S4, V4.S4 |
| 84 | + VSRI $20, V17.S4, V5.S4 |
| 85 | + VSRI $20, V18.S4, V6.S4 |
| 86 | + VSRI $20, V19.S4, V7.S4 |
| 87 | + |
| 88 | + // V0..V3 += V4..V7 |
| 89 | + // V12..V15 <<<= ((V12..V15 XOR V0..V3), 8); the 8-bit rotate is a byte shuffle, VTBL indexed by V31 |
| 90 | + VADD V0.S4, V4.S4, V0.S4 |
| 91 | + VADD V1.S4, V5.S4, V1.S4 |
| 92 | + VADD V2.S4, V6.S4, V2.S4 |
| 93 | + VADD V3.S4, V7.S4, V3.S4 |
| 94 | + VEOR V12.B16, V0.B16, V12.B16 |
| 95 | + VEOR V13.B16, V1.B16, V13.B16 |
| 96 | + VEOR V14.B16, V2.B16, V14.B16 |
| 97 | + VEOR V15.B16, V3.B16, V15.B16 |
| 98 | + VTBL V31.B16, [V12.B16], V12.B16 |
| 99 | + VTBL V31.B16, [V13.B16], V13.B16 |
| 100 | + VTBL V31.B16, [V14.B16], V14.B16 |
| 101 | + VTBL V31.B16, [V15.B16], V15.B16 |
| 102 | + |
| 103 | + // V8..V11 += V12..V15 |
| 104 | + // V4..V7 <<<= ((V4..V7 XOR V8..V11), 7); XOR goes to temporaries V16..V19, rotate is VSHL $7 then VSRI $25 |
| 105 | + VADD V12.S4, V8.S4, V8.S4 |
| 106 | + VADD V13.S4, V9.S4, V9.S4 |
| 107 | + VADD V14.S4, V10.S4, V10.S4 |
| 108 | + VADD V15.S4, V11.S4, V11.S4 |
| 109 | + VEOR V8.B16, V4.B16, V16.B16 |
| 110 | + VEOR V9.B16, V5.B16, V17.B16 |
| 111 | + VEOR V10.B16, V6.B16, V18.B16 |
| 112 | + VEOR V11.B16, V7.B16, V19.B16 |
| 113 | + VSHL $7, V16.S4, V4.S4 |
| 114 | + VSHL $7, V17.S4, V5.S4 |
| 115 | + VSHL $7, V18.S4, V6.S4 |
| 116 | + VSHL $7, V19.S4, V7.S4 |
| 117 | + VSRI $25, V16.S4, V4.S4 |
| 118 | + VSRI $25, V17.S4, V5.S4 |
| 119 | + VSRI $25, V18.S4, V6.S4 |
| 120 | + VSRI $25, V19.S4, V7.S4 |
| 121 | + |
| 122 | + // diagonal round (operand registers are paired in rotated order instead of moving data): V0..V3 += V5..V7, V4 |
| 123 | + // V15,V12-V14 <<<= ((V15,V12-V14 XOR V0..V3), 16) |
| 124 | + VADD V0.S4, V5.S4, V0.S4 |
| 125 | + VADD V1.S4, V6.S4, V1.S4 |
| 126 | + VADD V2.S4, V7.S4, V2.S4 |
| 127 | + VADD V3.S4, V4.S4, V3.S4 |
| 128 | + VEOR V15.B16, V0.B16, V15.B16 |
| 129 | + VEOR V12.B16, V1.B16, V12.B16 |
| 130 | + VEOR V13.B16, V2.B16, V13.B16 |
| 131 | + VEOR V14.B16, V3.B16, V14.B16 |
| 132 | + VREV32 V12.H8, V12.H8 |
| 133 | + VREV32 V13.H8, V13.H8 |
| 134 | + VREV32 V14.H8, V14.H8 |
| 135 | + VREV32 V15.H8, V15.H8 |
| 136 | + |
| 137 | + // V10 += V15; V5 <<<= ((V10 XOR V5), 12) |
| 138 | + // likewise for (V11, V12, V6), (V8, V13, V7) and (V9, V14, V4) |
| 139 | + VADD V15.S4, V10.S4, V10.S4 |
| 140 | + VADD V12.S4, V11.S4, V11.S4 |
| 141 | + VADD V13.S4, V8.S4, V8.S4 |
| 142 | + VADD V14.S4, V9.S4, V9.S4 |
| 143 | + VEOR V10.B16, V5.B16, V16.B16 |
| 144 | + VEOR V11.B16, V6.B16, V17.B16 |
| 145 | + VEOR V8.B16, V7.B16, V18.B16 |
| 146 | + VEOR V9.B16, V4.B16, V19.B16 |
| 147 | + VSHL $12, V16.S4, V5.S4 |
| 148 | + VSHL $12, V17.S4, V6.S4 |
| 149 | + VSHL $12, V18.S4, V7.S4 |
| 150 | + VSHL $12, V19.S4, V4.S4 |
| 151 | + VSRI $20, V16.S4, V5.S4 |
| 152 | + VSRI $20, V17.S4, V6.S4 |
| 153 | + VSRI $20, V18.S4, V7.S4 |
| 154 | + VSRI $20, V19.S4, V4.S4 |
| 155 | + |
| 156 | + // V0 += V5; V15 <<<= ((V0 XOR V15), 8) |
| 157 | + // likewise for (V1, V6, V12), (V2, V7, V13) and (V3, V4, V14) |
| 158 | + VADD V5.S4, V0.S4, V0.S4 |
| 159 | + VADD V6.S4, V1.S4, V1.S4 |
| 160 | + VADD V7.S4, V2.S4, V2.S4 |
| 161 | + VADD V4.S4, V3.S4, V3.S4 |
| 162 | + VEOR V0.B16, V15.B16, V15.B16 |
| 163 | + VEOR V1.B16, V12.B16, V12.B16 |
| 164 | + VEOR V2.B16, V13.B16, V13.B16 |
| 165 | + VEOR V3.B16, V14.B16, V14.B16 |
| 166 | + VTBL V31.B16, [V12.B16], V12.B16 |
| 167 | + VTBL V31.B16, [V13.B16], V13.B16 |
| 168 | + VTBL V31.B16, [V14.B16], V14.B16 |
| 169 | + VTBL V31.B16, [V15.B16], V15.B16 |
| 170 | + |
| 171 | + // V10 += V15; V5 <<<= ((V10 XOR V5), 7) |
| 172 | + // likewise for (V11, V12, V6), (V8, V13, V7) and (V9, V14, V4) |
| 173 | + VADD V15.S4, V10.S4, V10.S4 |
| 174 | + VADD V12.S4, V11.S4, V11.S4 |
| 175 | + VADD V13.S4, V8.S4, V8.S4 |
| 176 | + VADD V14.S4, V9.S4, V9.S4 |
| 177 | + VEOR V10.B16, V5.B16, V16.B16 |
| 178 | + VEOR V11.B16, V6.B16, V17.B16 |
| 179 | + VEOR V8.B16, V7.B16, V18.B16 |
| 180 | + VEOR V9.B16, V4.B16, V19.B16 |
| 181 | + VSHL $7, V16.S4, V5.S4 |
| 182 | + VSHL $7, V17.S4, V6.S4 |
| 183 | + VSHL $7, V18.S4, V7.S4 |
| 184 | + VSHL $7, V19.S4, V4.S4 |
| 185 | + VSRI $25, V16.S4, V5.S4 |
| 186 | + VSRI $25, V17.S4, V6.S4 |
| 187 | + VSRI $25, V18.S4, V7.S4 |
| 188 | + VSRI $25, V19.S4, V4.S4 |
| 189 | + |
| 190 | + SUB $1, R21 |
| 191 | + CBNZ R21, chacha |
| 192 | + |
| 193 | + // VLD4R (R10), [V16.S4, V17.S4, V18.S4, V19.S4]; reload the constants so they can be added back into V0..V3 |
| 194 | + WORD $0x4D60E950 |
| 195 | + |
| 196 | + // VLD4R 16(R4), [V20.S4, V21.S4, V22.S4, V23.S4]; reload key words 0..3 (R4 advances by 16); the VADD of V30 below re-applies the per-lane counter offsets while V30 is still intact |
| 197 | + WORD $0x4DFFE894 |
| 198 | + VADD V30.S4, V12.S4, V12.S4 |
| 199 | + VADD V16.S4, V0.S4, V0.S4 |
| 200 | + VADD V17.S4, V1.S4, V1.S4 |
| 201 | + VADD V18.S4, V2.S4, V2.S4 |
| 202 | + VADD V19.S4, V3.S4, V3.S4 |
| 203 | + // VLD4R 16(R4), [V24.S4, V25.S4, V26.S4, V27.S4]; reload key words 4..7 (R4 advances by another 16) |
| 204 | + WORD $0x4DFFE898 |
| 205 | + // restore R4 after the two post-incrementing key loads |
| 206 | + SUB $32, R4 |
| 207 | + |
| 208 | + // load counter + nonce again, to be added back into V12..V15 below |
| 209 | + // VLD1R (R7), [V28.S4] |
| 210 | + WORD $0x4D40C8FC |
| 211 | + // VLD3R (R6), [V29.S4, V30.S4, V31.S4]; this overwrites V30/V31 (counter offsets and rotate table), which are reloaded from R11 at the top of loop |
| 212 | + WORD $0x4D40E8DD |
| 213 | + |
| 214 | + VADD V20.S4, V4.S4, V4.S4 |
| 215 | + VADD V21.S4, V5.S4, V5.S4 |
| 216 | + VADD V22.S4, V6.S4, V6.S4 |
| 217 | + VADD V23.S4, V7.S4, V7.S4 |
| 218 | + VADD V24.S4, V8.S4, V8.S4 |
| 219 | + VADD V25.S4, V9.S4, V9.S4 |
| 220 | + VADD V26.S4, V10.S4, V10.S4 |
| 221 | + VADD V27.S4, V11.S4, V11.S4 |
| 222 | + VADD V28.S4, V12.S4, V12.S4 |
| 223 | + VADD V29.S4, V13.S4, V13.S4 |
| 224 | + VADD V30.S4, V14.S4, V14.S4 |
| 225 | + VADD V31.S4, V15.S4, V15.S4 |
| 226 | + |
| 227 | + VZIP1 V1.S4, V0.S4, V16.S4 |
| 228 | + VZIP2 V1.S4, V0.S4, V17.S4 |
| 229 | + VZIP1 V3.S4, V2.S4, V18.S4 |
| 230 | + VZIP2 V3.S4, V2.S4, V19.S4 |
| 231 | + VZIP1 V5.S4, V4.S4, V20.S4 |
| 232 | + VZIP2 V5.S4, V4.S4, V21.S4 |
| 233 | + VZIP1 V7.S4, V6.S4, V22.S4 |
| 234 | + VZIP2 V7.S4, V6.S4, V23.S4 |
| 235 | + VZIP1 V9.S4, V8.S4, V24.S4 |
| 236 | + VZIP2 V9.S4, V8.S4, V25.S4 |
| 237 | + VZIP1 V11.S4, V10.S4, V26.S4 |
| 238 | + VZIP2 V11.S4, V10.S4, V27.S4 |
| 239 | + VZIP1 V13.S4, V12.S4, V28.S4 |
| 240 | + VZIP2 V13.S4, V12.S4, V29.S4 |
| 241 | + VZIP1 V15.S4, V14.S4, V30.S4 |
| 242 | + VZIP2 V15.S4, V14.S4, V31.S4 |
| 243 | + VZIP1 V18.D2, V16.D2, V0.D2 |
| 244 | + VZIP2 V18.D2, V16.D2, V4.D2 |
| 245 | + VZIP1 V19.D2, V17.D2, V8.D2 |
| 246 | + VZIP2 V19.D2, V17.D2, V12.D2 |
| 247 | + VLD1.P 64(R2), [V16.B16, V17.B16, V18.B16, V19.B16] |
| 248 | + |
| 249 | + VZIP1 V22.D2, V20.D2, V1.D2 |
| 250 | + VZIP2 V22.D2, V20.D2, V5.D2 |
| 251 | + VZIP1 V23.D2, V21.D2, V9.D2 |
| 252 | + VZIP2 V23.D2, V21.D2, V13.D2 |
| 253 | + VLD1.P 64(R2), [V20.B16, V21.B16, V22.B16, V23.B16] |
| 254 | + VZIP1 V26.D2, V24.D2, V2.D2 |
| 255 | + VZIP2 V26.D2, V24.D2, V6.D2 |
| 256 | + VZIP1 V27.D2, V25.D2, V10.D2 |
| 257 | + VZIP2 V27.D2, V25.D2, V14.D2 |
| 258 | + VLD1.P 64(R2), [V24.B16, V25.B16, V26.B16, V27.B16] |
| 259 | + VZIP1 V30.D2, V28.D2, V3.D2 |
| 260 | + VZIP2 V30.D2, V28.D2, V7.D2 |
| 261 | + VZIP1 V31.D2, V29.D2, V11.D2 |
| 262 | + VZIP2 V31.D2, V29.D2, V15.D2 |
| 263 | + VLD1.P 64(R2), [V28.B16, V29.B16, V30.B16, V31.B16] |
| 264 | + VEOR V0.B16, V16.B16, V16.B16 |
| 265 | + VEOR V1.B16, V17.B16, V17.B16 |
| 266 | + VEOR V2.B16, V18.B16, V18.B16 |
| 267 | + VEOR V3.B16, V19.B16, V19.B16 |
| 268 | + VST1.P [V16.B16, V17.B16, V18.B16, V19.B16], 64(R1) |
| 269 | + VEOR V4.B16, V20.B16, V20.B16 |
| 270 | + VEOR V5.B16, V21.B16, V21.B16 |
| 271 | + VEOR V6.B16, V22.B16, V22.B16 |
| 272 | + VEOR V7.B16, V23.B16, V23.B16 |
| 273 | + VST1.P [V20.B16, V21.B16, V22.B16, V23.B16], 64(R1) |
| 274 | + VEOR V8.B16, V24.B16, V24.B16 |
| 275 | + VEOR V9.B16, V25.B16, V25.B16 |
| 276 | + VEOR V10.B16, V26.B16, V26.B16 |
| 277 | + VEOR V11.B16, V27.B16, V27.B16 |
| 278 | + VST1.P [V24.B16, V25.B16, V26.B16, V27.B16], 64(R1) |
| 279 | + VEOR V12.B16, V28.B16, V28.B16 |
| 280 | + VEOR V13.B16, V29.B16, V29.B16 |
| 281 | + VEOR V14.B16, V30.B16, V30.B16 |
| 282 | + VEOR V15.B16, V31.B16, V31.B16 |
| 283 | + VST1.P [V28.B16, V29.B16, V30.B16, V31.B16], 64(R1) |
| 284 | + |
| 285 | + ADD $4, R20 |
| 286 | + MOVW R20, (R7) // update counter: store R20 (advanced by 4 above, one per block produced) back through the counter pointer; the loop then repeats while R12 > R2 |
| 287 | + |
| 288 | + CMP R2, R12 |
| 289 | + BGT loop |
| 290 | + |
| 291 | + RET |
| 292 | + |
| 293 | + |
| 294 | +DATA ·constants+0x00(SB)/4, $0x61707865 // ASCII "expa" when stored in little-endian byte order |
| 295 | +DATA ·constants+0x04(SB)/4, $0x3320646e // "nd 3" |
| 296 | +DATA ·constants+0x08(SB)/4, $0x79622d32 // "2-by" |
| 297 | +DATA ·constants+0x0c(SB)/4, $0x6b206574 // "te k" |
| 298 | +GLOBL ·constants(SB), NOPTR|RODATA, $32 // read-only, pointer-free symbol; declared as 32 bytes although only the first 16 are given values above |
| 299 | + |
| 300 | +DATA ·incRotMatrix+0x00(SB)/4, $0x00000000 // first 16 bytes: the words 0, 1, 2, 3, loaded into V30 by xorKeyStreamVX and added to the broadcast counter |
| 301 | +DATA ·incRotMatrix+0x04(SB)/4, $0x00000001 |
| 302 | +DATA ·incRotMatrix+0x08(SB)/4, $0x00000002 |
| 303 | +DATA ·incRotMatrix+0x0c(SB)/4, $0x00000003 |
| 304 | +DATA ·incRotMatrix+0x10(SB)/4, $0x02010003 // last 16 bytes: VTBL byte indices loaded into V31; per 32-bit word the index bytes are 3,0,1,2, i.e. a rotate-left by 8 |
| 305 | +DATA ·incRotMatrix+0x14(SB)/4, $0x06050407 |
| 306 | +DATA ·incRotMatrix+0x18(SB)/4, $0x0A09080B |
| 307 | +DATA ·incRotMatrix+0x1c(SB)/4, $0x0E0D0C0F |
| 308 | +GLOBL ·incRotMatrix(SB), NOPTR|RODATA, $32 // read-only, pointer-free symbol; all 32 bytes are initialised above |
0 commit comments