#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2015
#
# ChaCha20 for ARMv8.
#
# Performance in cycles per byte out of a large buffer.
#
#			IALU/gcc-4.9	3xNEON+1xIALU	6xNEON+2xIALU(*)
#
# Apple A7		5.50/+49%	3.33		1.70
# Cortex-A53		8.40/+80%	4.72		4.72(**)
# Cortex-A57		8.06/+43%	4.90		4.43(***)
# Denver		4.50/+82%	2.63		2.67(**)
# X-Gene		9.50/+46%	8.82		8.89(**)
# Mongoose		8.00/+44%	3.64		3.25(***)
# Kryo			8.17/+50%	4.83		4.65(***)
#
# (*)	since no non-Apple processor exhibits significantly better
#	performance, the code path is #ifdef __APPLE__-ed;
# (**)	it's expected that doubling the interleave factor doesn't help
#	all processors, only those with higher NEON latency and
#	higher instruction issue rate;
# (***)	expected improvement was actually higher;

$flavour=shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
	die "can't locate arm-xlate.pl";

	open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
	open STDOUT,">$output";
}

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
  $arg = "#$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
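# Note: instruction helpers such as &add_32(), &eor_32() and &ror_32() used
# by ROUND() below are not defined anywhere; they are caught by AUTOLOAD
# above and appended to $code as "add.32"/"eor.32"/"ror.32" lines, which the
# post-processing loop at the end of this file rewrites into plain
# instructions on w-registers.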

my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4));

my @x=map("x$_",(5..17,19..21));
my @d=map("x$_",(22..28,30));
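# Scalar register assignment: @x holds the sixteen 32-bit working-state
# words, one per register (used as w-registers through the ".32" forms),
# while @d holds the 64-byte input block packed as eight 64-bit doublewords,
# hence the lsr/lsl #32 unpacking and packing below.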

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));

	(
	"&add_32 (@x[$a0],@x[$a0],@x[$b0])",
	"&add_32 (@x[$a1],@x[$a1],@x[$b1])",
	"&add_32 (@x[$a2],@x[$a2],@x[$b2])",
	"&add_32 (@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32 (@x[$d0],@x[$d0],16)",
	"&ror_32 (@x[$d1],@x[$d1],16)",
	"&ror_32 (@x[$d2],@x[$d2],16)",
	"&ror_32 (@x[$d3],@x[$d3],16)",

	"&add_32 (@x[$c0],@x[$c0],@x[$d0])",
	"&add_32 (@x[$c1],@x[$c1],@x[$d1])",
	"&add_32 (@x[$c2],@x[$c2],@x[$d2])",
	"&add_32 (@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32 (@x[$b0],@x[$b0],20)",
	"&ror_32 (@x[$b1],@x[$b1],20)",
	"&ror_32 (@x[$b2],@x[$b2],20)",
	"&ror_32 (@x[$b3],@x[$b3],20)",

	"&add_32 (@x[$a0],@x[$a0],@x[$b0])",
	"&add_32 (@x[$a1],@x[$a1],@x[$b1])",
	"&add_32 (@x[$a2],@x[$a2],@x[$b2])",
	"&add_32 (@x[$a3],@x[$a3],@x[$b3])",
	"&eor_32 (@x[$d0],@x[$d0],@x[$a0])",
	"&eor_32 (@x[$d1],@x[$d1],@x[$a1])",
	"&eor_32 (@x[$d2],@x[$d2],@x[$a2])",
	"&eor_32 (@x[$d3],@x[$d3],@x[$a3])",
	"&ror_32 (@x[$d0],@x[$d0],24)",
	"&ror_32 (@x[$d1],@x[$d1],24)",
	"&ror_32 (@x[$d2],@x[$d2],24)",
	"&ror_32 (@x[$d3],@x[$d3],24)",

	"&add_32 (@x[$c0],@x[$c0],@x[$d0])",
	"&add_32 (@x[$c1],@x[$c1],@x[$d1])",
	"&add_32 (@x[$c2],@x[$c2],@x[$d2])",
	"&add_32 (@x[$c3],@x[$c3],@x[$d3])",
	"&eor_32 (@x[$b0],@x[$b0],@x[$c0])",
	"&eor_32 (@x[$b1],@x[$b1],@x[$c1])",
	"&eor_32 (@x[$b2],@x[$b2],@x[$c2])",
	"&eor_32 (@x[$b3],@x[$b3],@x[$c3])",
	"&ror_32 (@x[$b0],@x[$b0],25)",
	"&ror_32 (@x[$b1],@x[$b1],25)",
	"&ror_32 (@x[$b2],@x[$b2],25)",
	"&ror_32 (@x[$b3],@x[$b3],25)"
	);
}
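# For reference only (nothing below is executed or emitted): the four-way
# interleaved ROUND() above is the standard ChaCha20 quarter-round; the
# rotate-right amounts 16/20/24/25 correspond to the rotate-left amounts
# 16/12/8/7 of the specification. A plain Perl sketch of one quarter-round:
#
#	sub quarter_round_ref {		# illustrative helper, unused
#	my ($a,$b,$c,$d)=@_;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d>>16)|($d<<16))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b>>20)|($b<<12))&0xffffffff;
#	$a=($a+$b)&0xffffffff; $d^=$a; $d=(($d>>24)|($d<<8))&0xffffffff;
#	$c=($c+$d)&0xffffffff; $b^=$c; $b=(($b>>25)|($b<<7))&0xffffffff;
#	($a,$b,$c,$d);
#	}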

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
.extern OPENSSL_armcap_P
#else
# define ChaCha20_ctr32 chacha20_arm
# define ChaCha20_neon chacha20_neon
#endif

.text

.align 5
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
.long 1,0,0,0
#ifndef __KERNEL__
.LOPENSSL_armcap_P:
# ifdef __ILP32__
.long OPENSSL_armcap_P-.
# else
.quad OPENSSL_armcap_P-.
# endif
#endif

.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
	cbz $len,.Labort
#ifndef __KERNEL__
	adr @x[0],.LOPENSSL_armcap_P
	cmp $len,#192
	b.lo .Lshort
# ifdef __ILP32__
	ldrsw @x[1],[@x[0]]
# else
	ldr @x[1],[@x[0]]
# endif
	ldr w17,[@x[1],@x[0]]
	tst w17,#ARMV7_NEON
	b.ne ChaCha20_neon

.Lshort:
#endif
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0

	adr @x[0],.Lsigma
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
	sub sp,sp,#64

	ldp @d[0],@d[1],[@x[0]] // load sigma
	ldp @d[2],@d[3],[$key] // load key
	ldp @d[4],@d[5],[$key,#16]
	ldp @d[6],@d[7],[$ctr] // load counter
#ifdef __AARCH64EB__
	ror @d[2],@d[2],#32
	ror @d[3],@d[3],#32
	ror @d[4],@d[4],#32
	ror @d[5],@d[5],#32
	ror @d[6],@d[6],#32
	ror @d[7],@d[7],#32
#endif

.Loop_outer:
	mov.32 @x[0],@d[0] // unpack key block
	lsr @x[1],@d[0],#32
	mov.32 @x[2],@d[1]
	lsr @x[3],@d[1],#32
	mov.32 @x[4],@d[2]
	lsr @x[5],@d[2],#32
	mov.32 @x[6],@d[3]
	lsr @x[7],@d[3],#32
	mov.32 @x[8],@d[4]
	lsr @x[9],@d[4],#32
	mov.32 @x[10],@d[5]
	lsr @x[11],@d[5],#32
	mov.32 @x[12],@d[6]
	lsr @x[13],@d[6],#32
	mov.32 @x[14],@d[7]
	lsr @x[15],@d[7],#32

	mov $ctr,#10
	subs $len,$len,#64
.Loop:
	sub $ctr,$ctr,#1
___
	foreach (&ROUND(0, 4, 8,12)) { eval; }
	foreach (&ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	cbnz $ctr,.Loop

	add.32 @x[0],@x[0],@d[0] // accumulate key block
	add @x[1],@x[1],@d[0],lsr#32
	add.32 @x[2],@x[2],@d[1]
	add @x[3],@x[3],@d[1],lsr#32
	add.32 @x[4],@x[4],@d[2]
	add @x[5],@x[5],@d[2],lsr#32
	add.32 @x[6],@x[6],@d[3]
	add @x[7],@x[7],@d[3],lsr#32
	add.32 @x[8],@x[8],@d[4]
	add @x[9],@x[9],@d[4],lsr#32
	add.32 @x[10],@x[10],@d[5]
	add @x[11],@x[11],@d[5],lsr#32
	add.32 @x[12],@x[12],@d[6]
	add @x[13],@x[13],@d[6],lsr#32
	add.32 @x[14],@x[14],@d[7]
	add @x[15],@x[15],@d[7],lsr#32

	b.lo .Ltail

	add @x[0],@x[0],@x[1],lsl#32 // pack
	add @x[2],@x[2],@x[3],lsl#32
	ldp @x[1],@x[3],[$inp,#0] // load input
	add @x[4],@x[4],@x[5],lsl#32
	add @x[6],@x[6],@x[7],lsl#32
	ldp @x[5],@x[7],[$inp,#16]
	add @x[8],@x[8],@x[9],lsl#32
	add @x[10],@x[10],@x[11],lsl#32
	ldp @x[9],@x[11],[$inp,#32]
	add @x[12],@x[12],@x[13],lsl#32
	add @x[14],@x[14],@x[15],lsl#32
	ldp @x[13],@x[15],[$inp,#48]
	add $inp,$inp,#64
#ifdef __AARCH64EB__
	rev @x[0],@x[0]
	rev @x[2],@x[2]
	rev @x[4],@x[4]
	rev @x[6],@x[6]
	rev @x[8],@x[8]
	rev @x[10],@x[10]
	rev @x[12],@x[12]
	rev @x[14],@x[14]
#endif
	eor @x[0],@x[0],@x[1]
	eor @x[2],@x[2],@x[3]
	eor @x[4],@x[4],@x[5]
	eor @x[6],@x[6],@x[7]
	eor @x[8],@x[8],@x[9]
	eor @x[10],@x[10],@x[11]
	eor @x[12],@x[12],@x[13]
	eor @x[14],@x[14],@x[15]

	stp @x[0],@x[2],[$out,#0] // store output
	add @d[6],@d[6],#1 // increment counter
	stp @x[4],@x[6],[$out,#16]
	stp @x[8],@x[10],[$out,#32]
	stp @x[12],@x[14],[$out,#48]
	add $out,$out,#64

	b.hi .Loop_outer

	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
.Labort:
	ret

.align 4
.Ltail:
	add $len,$len,#64
.Less_than_64:
	sub $out,$out,#1
	add $inp,$inp,$len
	add $out,$out,$len
	add $ctr,sp,$len
	neg $len,$len

	add @x[0],@x[0],@x[1],lsl#32 // pack
	add @x[2],@x[2],@x[3],lsl#32
	add @x[4],@x[4],@x[5],lsl#32
	add @x[6],@x[6],@x[7],lsl#32
	add @x[8],@x[8],@x[9],lsl#32
	add @x[10],@x[10],@x[11],lsl#32
	add @x[12],@x[12],@x[13],lsl#32
	add @x[14],@x[14],@x[15],lsl#32
#ifdef __AARCH64EB__
	rev @x[0],@x[0]
	rev @x[2],@x[2]
	rev @x[4],@x[4]
	rev @x[6],@x[6]
	rev @x[8],@x[8]
	rev @x[10],@x[10]
	rev @x[12],@x[12]
	rev @x[14],@x[14]
#endif
	stp @x[0],@x[2],[sp,#0]
	stp @x[4],@x[6],[sp,#16]
	stp @x[8],@x[10],[sp,#32]
	stp @x[12],@x[14],[sp,#48]

.Loop_tail:
	ldrb w10,[$inp,$len]
	ldrb w11,[$ctr,$len]
	add $len,$len,#1
	eor w10,w10,w11
	strb w10,[$out,$len]
	cbnz $len,.Loop_tail

	stp xzr,xzr,[sp,#0]
	stp xzr,xzr,[sp,#16]
	stp xzr,xzr,[sp,#32]
	stp xzr,xzr,[sp,#48]

	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret
.size ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
    map("v$_.4s",(0..7,16..23));
my (@K)=map("v$_.4s",(24..30));
my $ONE="v31.4s";
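# NEON register assignment for the 3xNEON+1xIALU path: $A0-$D0, $A1-$D1 and
# $A2-$D2 are the four state rows of three vector blocks processed alongside
# the scalar block kept in @x, @K caches the expanded key block (sigma, key,
# counters) and $ONE holds the counter increment.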

sub NEONROUND {
my $odd = pop;
my ($a,$b,$c,$d,$t)=@_;

	(
	"&add ('$a','$a','$b')",
	"&eor ('$d','$d','$a')",
	"&rev32_16 ('$d','$d')", # vrot ($d,16)

	"&add ('$c','$c','$d')",
	"&eor ('$t','$b','$c')",
	"&ushr ('$b','$t',20)",
	"&sli ('$b','$t',12)",

	"&add ('$a','$a','$b')",
	"&eor ('$t','$d','$a')",
	"&ushr ('$d','$t',24)",
	"&sli ('$d','$t',8)",

	"&add ('$c','$c','$d')",
	"&eor ('$t','$b','$c')",
	"&ushr ('$b','$t',25)",
	"&sli ('$b','$t',7)",

	"&ext ('$a','$a','$a',$odd?4:12)",
	"&ext ('$d','$d','$d',8)",
	"&ext ('$c','$c','$c',$odd?12:4)"
	);
}
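# NEONROUND() emits one column round (or, with $odd set, one diagonal round)
# for a single vector block: the rotate-by-16 uses rev32 on halfwords, the
# 12/8/7-bit rotates use ushr+sli pairs, and the trailing ext instructions
# rotate the $a/$c/$d lanes after the column round so the next ($odd) call
# works on the diagonals, whose own trailing ext rotates them back.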

$code.=<<___;
#if !defined(__KERNEL__) || defined(CONFIG_KERNEL_MODE_NEON)
#ifdef __KERNEL__
.globl ChaCha20_neon
.type ChaCha20_neon,%function
#endif
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0

	adr @x[0],.Lsigma
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]
#ifdef __APPLE__
	cmp $len,#512
	b.hs .L512_or_more_neon
#endif

	sub sp,sp,#64

	ldp @d[0],@d[1],[@x[0]] // load sigma
	ld1 {@K[0]},[@x[0]],#16
	ldp @d[2],@d[3],[$key] // load key
	ldp @d[4],@d[5],[$key,#16]
	ld1 {@K[1],@K[2]},[$key]
	ldp @d[6],@d[7],[$ctr] // load counter
	ld1 {@K[3]},[$ctr]
	ld1 {$ONE},[@x[0]]
#ifdef __AARCH64EB__
	rev64 @K[0],@K[0]
	ror @d[2],@d[2],#32
	ror @d[3],@d[3],#32
	ror @d[4],@d[4],#32
	ror @d[5],@d[5],#32
	ror @d[6],@d[6],#32
	ror @d[7],@d[7],#32
#endif
	add @K[3],@K[3],$ONE // += 1
	add @K[4],@K[3],$ONE
	add @K[5],@K[4],$ONE
	shl $ONE,$ONE,#2 // 1 -> 4

.Loop_outer_neon:
	mov.32 @x[0],@d[0] // unpack key block
	lsr @x[1],@d[0],#32
	mov $A0,@K[0]
	mov.32 @x[2],@d[1]
	lsr @x[3],@d[1],#32
	mov $A1,@K[0]
	mov.32 @x[4],@d[2]
	lsr @x[5],@d[2],#32
	mov $A2,@K[0]
	mov.32 @x[6],@d[3]
	mov $B0,@K[1]
	lsr @x[7],@d[3],#32
	mov $B1,@K[1]
	mov.32 @x[8],@d[4]
	mov $B2,@K[1]
	lsr @x[9],@d[4],#32
	mov $D0,@K[3]
	mov.32 @x[10],@d[5]
	mov $D1,@K[4]
	lsr @x[11],@d[5],#32
	mov $D2,@K[5]
	mov.32 @x[12],@d[6]
	mov $C0,@K[2]
	lsr @x[13],@d[6],#32
	mov $C1,@K[2]
	mov.32 @x[14],@d[7]
	mov $C2,@K[2]
	lsr @x[15],@d[7],#32

	mov $ctr,#10
	subs $len,$len,#256
.Loop_neon:
	sub $ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&ROUND(0,4,8,12);

	foreach (@thread0) {
		eval; eval(shift(@thread3));
		eval(shift(@thread1)); eval(shift(@thread3));
		eval(shift(@thread2)); eval(shift(@thread3));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&ROUND(0,5,10,15);

	foreach (@thread0) {
		eval; eval(shift(@thread3));
		eval(shift(@thread1)); eval(shift(@thread3));
		eval(shift(@thread2)); eval(shift(@thread3));
	}
$code.=<<___;
	cbnz $ctr,.Loop_neon

	add.32 @x[0],@x[0],@d[0] // accumulate key block
	add $A0,$A0,@K[0]
	add @x[1],@x[1],@d[0],lsr#32
	add $A1,$A1,@K[0]
	add.32 @x[2],@x[2],@d[1]
	add $A2,$A2,@K[0]
	add @x[3],@x[3],@d[1],lsr#32
	add $C0,$C0,@K[2]
	add.32 @x[4],@x[4],@d[2]
	add $C1,$C1,@K[2]
	add @x[5],@x[5],@d[2],lsr#32
	add $C2,$C2,@K[2]
	add.32 @x[6],@x[6],@d[3]
	add $D0,$D0,@K[3]
	add @x[7],@x[7],@d[3],lsr#32
	add.32 @x[8],@x[8],@d[4]
	add $D1,$D1,@K[4]
	add @x[9],@x[9],@d[4],lsr#32
	add.32 @x[10],@x[10],@d[5]
	add $D2,$D2,@K[5]
	add @x[11],@x[11],@d[5],lsr#32
	add.32 @x[12],@x[12],@d[6]
	add $B0,$B0,@K[1]
	add @x[13],@x[13],@d[6],lsr#32
	add.32 @x[14],@x[14],@d[7]
	add $B1,$B1,@K[1]
	add @x[15],@x[15],@d[7],lsr#32
	add $B2,$B2,@K[1]

	b.lo .Ltail_neon

	add @x[0],@x[0],@x[1],lsl#32 // pack
	add @x[2],@x[2],@x[3],lsl#32
	ldp @x[1],@x[3],[$inp,#0] // load input
	add @x[4],@x[4],@x[5],lsl#32
	add @x[6],@x[6],@x[7],lsl#32
	ldp @x[5],@x[7],[$inp,#16]
	add @x[8],@x[8],@x[9],lsl#32
	add @x[10],@x[10],@x[11],lsl#32
	ldp @x[9],@x[11],[$inp,#32]
	add @x[12],@x[12],@x[13],lsl#32
	add @x[14],@x[14],@x[15],lsl#32
	ldp @x[13],@x[15],[$inp,#48]
	add $inp,$inp,#64
#ifdef __AARCH64EB__
	rev @x[0],@x[0]
	rev @x[2],@x[2]
	rev @x[4],@x[4]
	rev @x[6],@x[6]
	rev @x[8],@x[8]
	rev @x[10],@x[10]
	rev @x[12],@x[12]
	rev @x[14],@x[14]
#endif
	ld1.8 {$T0-$T3},[$inp],#64
	eor @x[0],@x[0],@x[1]
	eor @x[2],@x[2],@x[3]
	eor @x[4],@x[4],@x[5]
	eor @x[6],@x[6],@x[7]
	eor @x[8],@x[8],@x[9]
	eor $A0,$A0,$T0
	eor @x[10],@x[10],@x[11]
	eor $B0,$B0,$T1
	eor @x[12],@x[12],@x[13]
	eor $C0,$C0,$T2
	eor @x[14],@x[14],@x[15]
	eor $D0,$D0,$T3
	ld1.8 {$T0-$T3},[$inp],#64

	stp @x[0],@x[2],[$out,#0] // store output
	add @d[6],@d[6],#4 // increment counter
	stp @x[4],@x[6],[$out,#16]
	add @K[3],@K[3],$ONE // += 4
	stp @x[8],@x[10],[$out,#32]
	add @K[4],@K[4],$ONE
	stp @x[12],@x[14],[$out,#48]
	add @K[5],@K[5],$ONE
	add $out,$out,#64

	st1.8 {$A0-$D0},[$out],#64
	ld1.8 {$A0-$D0},[$inp],#64

	eor $A1,$A1,$T0
	eor $B1,$B1,$T1
	eor $C1,$C1,$T2
	eor $D1,$D1,$T3
	st1.8 {$A1-$D1},[$out],#64

	eor $A2,$A2,$A0
	eor $B2,$B2,$B0
	eor $C2,$C2,$C0
	eor $D2,$D2,$D0
	st1.8 {$A2-$D2},[$out],#64

	b.hi .Loop_outer_neon

	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret

.Ltail_neon:
	add $len,$len,#256
	cmp $len,#64
	b.lo .Less_than_64

	add @x[0],@x[0],@x[1],lsl#32 // pack
	add @x[2],@x[2],@x[3],lsl#32
	ldp @x[1],@x[3],[$inp,#0] // load input
	add @x[4],@x[4],@x[5],lsl#32
	add @x[6],@x[6],@x[7],lsl#32
	ldp @x[5],@x[7],[$inp,#16]
	add @x[8],@x[8],@x[9],lsl#32
	add @x[10],@x[10],@x[11],lsl#32
	ldp @x[9],@x[11],[$inp,#32]
	add @x[12],@x[12],@x[13],lsl#32
	add @x[14],@x[14],@x[15],lsl#32
	ldp @x[13],@x[15],[$inp,#48]
	add $inp,$inp,#64
#ifdef __AARCH64EB__
	rev @x[0],@x[0]
	rev @x[2],@x[2]
	rev @x[4],@x[4]
	rev @x[6],@x[6]
	rev @x[8],@x[8]
	rev @x[10],@x[10]
	rev @x[12],@x[12]
	rev @x[14],@x[14]
#endif
	eor @x[0],@x[0],@x[1]
	eor @x[2],@x[2],@x[3]
	eor @x[4],@x[4],@x[5]
	eor @x[6],@x[6],@x[7]
	eor @x[8],@x[8],@x[9]
	eor @x[10],@x[10],@x[11]
	eor @x[12],@x[12],@x[13]
	eor @x[14],@x[14],@x[15]

	stp @x[0],@x[2],[$out,#0] // store output
	add @d[6],@d[6],#4 // increment counter
	stp @x[4],@x[6],[$out,#16]
	stp @x[8],@x[10],[$out,#32]
	stp @x[12],@x[14],[$out,#48]
	add $out,$out,#64
	b.eq .Ldone_neon
	sub $len,$len,#64
	cmp $len,#64
	b.lo .Less_than_128

	ld1.8 {$T0-$T3},[$inp],#64
	eor $A0,$A0,$T0
	eor $B0,$B0,$T1
	eor $C0,$C0,$T2
	eor $D0,$D0,$T3
	st1.8 {$A0-$D0},[$out],#64
	b.eq .Ldone_neon
	sub $len,$len,#64
	cmp $len,#64
	b.lo .Less_than_192

	ld1.8 {$T0-$T3},[$inp],#64
	eor $A1,$A1,$T0
	eor $B1,$B1,$T1
	eor $C1,$C1,$T2
	eor $D1,$D1,$T3
	st1.8 {$A1-$D1},[$out],#64
	b.eq .Ldone_neon
	sub $len,$len,#64

	st1.8 {$A2-$D2},[sp]
	b .Last_neon

.Less_than_128:
	st1.8 {$A0-$D0},[sp]
	b .Last_neon
.Less_than_192:
	st1.8 {$A1-$D1},[sp]
	b .Last_neon

.align 4
.Last_neon:
	sub $out,$out,#1
	add $inp,$inp,$len
	add $out,$out,$len
	add $ctr,sp,$len
	neg $len,$len

.Loop_tail_neon:
	ldrb w10,[$inp,$len]
	ldrb w11,[$ctr,$len]
	add $len,$len,#1
	eor w10,w10,w11
	strb w10,[$out,$len]
	cbnz $len,.Loop_tail_neon

	stp xzr,xzr,[sp,#0]
	stp xzr,xzr,[sp,#16]
	stp xzr,xzr,[sp,#32]
	stp xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp x19,x20,[x29,#16]
	add sp,sp,#64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret
.size ChaCha20_neon,.-ChaCha20_neon
___
{
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
    $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));

$code.=<<___;
#ifdef __APPLE__
.type ChaCha20_512_neon,%function
.align 5
ChaCha20_512_neon:
	stp x29,x30,[sp,#-96]!
	add x29,sp,#0

	adr @x[0],.Lsigma
	stp x19,x20,[sp,#16]
	stp x21,x22,[sp,#32]
	stp x23,x24,[sp,#48]
	stp x25,x26,[sp,#64]
	stp x27,x28,[sp,#80]

.L512_or_more_neon:
	sub sp,sp,#128+64

	ldp @d[0],@d[1],[@x[0]] // load sigma
	ld1 {@K[0]},[@x[0]],#16
	ldp @d[2],@d[3],[$key] // load key
	ldp @d[4],@d[5],[$key,#16]
	ld1 {@K[1],@K[2]},[$key]
	ldp @d[6],@d[7],[$ctr] // load counter
	ld1 {@K[3]},[$ctr]
	ld1 {$ONE},[@x[0]]
# ifdef __AARCH64EB__
	rev64 @K[0],@K[0]
	ror @d[2],@d[2],#32
	ror @d[3],@d[3],#32
	ror @d[4],@d[4],#32
	ror @d[5],@d[5],#32
	ror @d[6],@d[6],#32
	ror @d[7],@d[7],#32
# endif
	add @K[3],@K[3],$ONE // += 1
	stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part
	add @K[3],@K[3],$ONE // not typo
	str @K[2],[sp,#32]
	add @K[4],@K[3],$ONE
	add @K[5],@K[4],$ONE
	add @K[6],@K[5],$ONE
	shl $ONE,$ONE,#2 // 1 -> 4

	stp d8,d9,[sp,#128+0] // meet ABI requirements
	stp d10,d11,[sp,#128+16]
	stp d12,d13,[sp,#128+32]
	stp d14,d15,[sp,#128+48]

	sub $len,$len,#512 // not typo

.Loop_outer_512_neon:
	mov $A0,@K[0]
	mov $A1,@K[0]
	mov $A2,@K[0]
	mov $A3,@K[0]
	mov $A4,@K[0]
	mov $A5,@K[0]
	mov $B0,@K[1]
	mov.32 @x[0],@d[0] // unpack key block
	mov $B1,@K[1]
	lsr @x[1],@d[0],#32
	mov $B2,@K[1]
	mov.32 @x[2],@d[1]
	mov $B3,@K[1]
	lsr @x[3],@d[1],#32
	mov $B4,@K[1]
	mov.32 @x[4],@d[2]
	mov $B5,@K[1]
	lsr @x[5],@d[2],#32
	mov $D0,@K[3]
	mov.32 @x[6],@d[3]
	mov $D1,@K[4]
	lsr @x[7],@d[3],#32
	mov $D2,@K[5]
	mov.32 @x[8],@d[4]
	mov $D3,@K[6]
	lsr @x[9],@d[4],#32
	mov $C0,@K[2]
	mov.32 @x[10],@d[5]
	mov $C1,@K[2]
	lsr @x[11],@d[5],#32
	add $D4,$D0,$ONE // +4
	mov.32 @x[12],@d[6]
	add $D5,$D1,$ONE // +4
	lsr @x[13],@d[6],#32
	mov $C2,@K[2]
	mov.32 @x[14],@d[7]
	mov $C3,@K[2]
	lsr @x[15],@d[7],#32
	mov $C4,@K[2]
	stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
	mov $C5,@K[2]
	str @K[5],[sp,#80]

	mov $ctr,#5
	subs $len,$len,#512
.Loop_upper_neon:
	sub $ctr,$ctr,#1
___
	my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	my @thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	my @thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	my @thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	my @thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));
	my $diff = ($#thread0+1)*6 - $#thread67 - 1;
	my $i = 0;

	foreach (@thread0) {
		eval; eval(shift(@thread67));
		eval(shift(@thread1)); eval(shift(@thread67));
		eval(shift(@thread2)); eval(shift(@thread67));
		eval(shift(@thread3)); eval(shift(@thread67));
		eval(shift(@thread4)); eval(shift(@thread67));
		eval(shift(@thread5)); eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval; eval(shift(@thread67));
		eval(shift(@thread1)); eval(shift(@thread67));
		eval(shift(@thread2)); eval(shift(@thread67));
		eval(shift(@thread3)); eval(shift(@thread67));
		eval(shift(@thread4)); eval(shift(@thread67));
		eval(shift(@thread5)); eval(shift(@thread67));
	}
$code.=<<___;
	cbnz $ctr,.Loop_upper_neon

	add.32 @x[0],@x[0],@d[0] // accumulate key block
	add @x[1],@x[1],@d[0],lsr#32
	add.32 @x[2],@x[2],@d[1]
	add @x[3],@x[3],@d[1],lsr#32
	add.32 @x[4],@x[4],@d[2]
	add @x[5],@x[5],@d[2],lsr#32
	add.32 @x[6],@x[6],@d[3]
	add @x[7],@x[7],@d[3],lsr#32
	add.32 @x[8],@x[8],@d[4]
	add @x[9],@x[9],@d[4],lsr#32
	add.32 @x[10],@x[10],@d[5]
	add @x[11],@x[11],@d[5],lsr#32
	add.32 @x[12],@x[12],@d[6]
	add @x[13],@x[13],@d[6],lsr#32
	add.32 @x[14],@x[14],@d[7]
	add @x[15],@x[15],@d[7],lsr#32

	add @x[0],@x[0],@x[1],lsl#32 // pack
	add @x[2],@x[2],@x[3],lsl#32
	ldp @x[1],@x[3],[$inp,#0] // load input
	add @x[4],@x[4],@x[5],lsl#32
	add @x[6],@x[6],@x[7],lsl#32
	ldp @x[5],@x[7],[$inp,#16]
	add @x[8],@x[8],@x[9],lsl#32
	add @x[10],@x[10],@x[11],lsl#32
	ldp @x[9],@x[11],[$inp,#32]
	add @x[12],@x[12],@x[13],lsl#32
	add @x[14],@x[14],@x[15],lsl#32
	ldp @x[13],@x[15],[$inp,#48]
	add $inp,$inp,#64
# ifdef __AARCH64EB__
	rev @x[0],@x[0]
	rev @x[2],@x[2]
	rev @x[4],@x[4]
	rev @x[6],@x[6]
	rev @x[8],@x[8]
	rev @x[10],@x[10]
	rev @x[12],@x[12]
	rev @x[14],@x[14]
# endif
	eor @x[0],@x[0],@x[1]
	eor @x[2],@x[2],@x[3]
	eor @x[4],@x[4],@x[5]
	eor @x[6],@x[6],@x[7]
	eor @x[8],@x[8],@x[9]
	eor @x[10],@x[10],@x[11]
	eor @x[12],@x[12],@x[13]
	eor @x[14],@x[14],@x[15]

	stp @x[0],@x[2],[$out,#0] // store output
	add @d[6],@d[6],#1 // increment counter
	mov.32 @x[0],@d[0] // unpack key block
	lsr @x[1],@d[0],#32
	stp @x[4],@x[6],[$out,#16]
	mov.32 @x[2],@d[1]
	lsr @x[3],@d[1],#32
	stp @x[8],@x[10],[$out,#32]
	mov.32 @x[4],@d[2]
	lsr @x[5],@d[2],#32
	stp @x[12],@x[14],[$out,#48]
	add $out,$out,#64
	mov.32 @x[6],@d[3]
	lsr @x[7],@d[3],#32
	mov.32 @x[8],@d[4]
	lsr @x[9],@d[4],#32
	mov.32 @x[10],@d[5]
	lsr @x[11],@d[5],#32
	mov.32 @x[12],@d[6]
	lsr @x[13],@d[6],#32
	mov.32 @x[14],@d[7]
	lsr @x[15],@d[7],#32

	mov $ctr,#5
.Loop_lower_neon:
	sub $ctr,$ctr,#1
___
	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,0);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,0);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,0);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval; eval(shift(@thread67));
		eval(shift(@thread1)); eval(shift(@thread67));
		eval(shift(@thread2)); eval(shift(@thread67));
		eval(shift(@thread3)); eval(shift(@thread67));
		eval(shift(@thread4)); eval(shift(@thread67));
		eval(shift(@thread5)); eval(shift(@thread67));
	}

	@thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
	@thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
	@thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
	@thread3=&NEONROUND($A3,$B3,$C3,$D3,$T3,1);
	@thread4=&NEONROUND($A4,$B4,$C4,$D4,$T4,1);
	@thread5=&NEONROUND($A5,$B5,$C5,$D5,$T5,1);
	@thread67=(&ROUND(0,4,8,12),&ROUND(0,5,10,15));

	foreach (@thread0) {
		eval; eval(shift(@thread67));
		eval(shift(@thread1)); eval(shift(@thread67));
		eval(shift(@thread2)); eval(shift(@thread67));
		eval(shift(@thread3)); eval(shift(@thread67));
		eval(shift(@thread4)); eval(shift(@thread67));
		eval(shift(@thread5)); eval(shift(@thread67));
	}
$code.=<<___;
	cbnz $ctr,.Loop_lower_neon

	add.32 @x[0],@x[0],@d[0] // accumulate key block
	ldp @K[0],@K[1],[sp,#0]
	add @x[1],@x[1],@d[0],lsr#32
	ldp @K[2],@K[3],[sp,#32]
	add.32 @x[2],@x[2],@d[1]
	ldp @K[4],@K[5],[sp,#64]
	add @x[3],@x[3],@d[1],lsr#32
	add $A0,$A0,@K[0]
	add.32 @x[4],@x[4],@d[2]
	add $A1,$A1,@K[0]
	add @x[5],@x[5],@d[2],lsr#32
	add $A2,$A2,@K[0]
	add.32 @x[6],@x[6],@d[3]
	add $A3,$A3,@K[0]
	add @x[7],@x[7],@d[3],lsr#32
	add $A4,$A4,@K[0]
	add.32 @x[8],@x[8],@d[4]
	add $A5,$A5,@K[0]
	add @x[9],@x[9],@d[4],lsr#32
	add $C0,$C0,@K[2]
	add.32 @x[10],@x[10],@d[5]
	add $C1,$C1,@K[2]
	add @x[11],@x[11],@d[5],lsr#32
	add $C2,$C2,@K[2]
	add.32 @x[12],@x[12],@d[6]
	add $C3,$C3,@K[2]
	add @x[13],@x[13],@d[6],lsr#32
	add $C4,$C4,@K[2]
	add.32 @x[14],@x[14],@d[7]
	add $C5,$C5,@K[2]
	add @x[15],@x[15],@d[7],lsr#32
	add $D4,$D4,$ONE // +4
	add @x[0],@x[0],@x[1],lsl#32 // pack
	add $D5,$D5,$ONE // +4
	add @x[2],@x[2],@x[3],lsl#32
	add $D0,$D0,@K[3]
	ldp @x[1],@x[3],[$inp,#0] // load input
	add $D1,$D1,@K[4]
	add @x[4],@x[4],@x[5],lsl#32
	add $D2,$D2,@K[5]
	add @x[6],@x[6],@x[7],lsl#32
	add $D3,$D3,@K[6]
	ldp @x[5],@x[7],[$inp,#16]
	add $D4,$D4,@K[3]
	add @x[8],@x[8],@x[9],lsl#32
	add $D5,$D5,@K[4]
	add @x[10],@x[10],@x[11],lsl#32
	add $B0,$B0,@K[1]
	ldp @x[9],@x[11],[$inp,#32]
	add $B1,$B1,@K[1]
	add @x[12],@x[12],@x[13],lsl#32
	add $B2,$B2,@K[1]
	add @x[14],@x[14],@x[15],lsl#32
	add $B3,$B3,@K[1]
	ldp @x[13],@x[15],[$inp,#48]
	add $B4,$B4,@K[1]
	add $inp,$inp,#64
	add $B5,$B5,@K[1]

# ifdef __AARCH64EB__
	rev @x[0],@x[0]
	rev @x[2],@x[2]
	rev @x[4],@x[4]
	rev @x[6],@x[6]
	rev @x[8],@x[8]
	rev @x[10],@x[10]
	rev @x[12],@x[12]
	rev @x[14],@x[14]
# endif
	ld1.8 {$T0-$T3},[$inp],#64
	eor @x[0],@x[0],@x[1]
	eor @x[2],@x[2],@x[3]
	eor @x[4],@x[4],@x[5]
	eor @x[6],@x[6],@x[7]
	eor @x[8],@x[8],@x[9]
	eor $A0,$A0,$T0
	eor @x[10],@x[10],@x[11]
	eor $B0,$B0,$T1
	eor @x[12],@x[12],@x[13]
	eor $C0,$C0,$T2
	eor @x[14],@x[14],@x[15]
	eor $D0,$D0,$T3
	ld1.8 {$T0-$T3},[$inp],#64

	stp @x[0],@x[2],[$out,#0] // store output
	add @d[6],@d[6],#7 // increment counter
	stp @x[4],@x[6],[$out,#16]
	stp @x[8],@x[10],[$out,#32]
	stp @x[12],@x[14],[$out,#48]
	add $out,$out,#64
	st1.8 {$A0-$D0},[$out],#64

	ld1.8 {$A0-$D0},[$inp],#64
	eor $A1,$A1,$T0
	eor $B1,$B1,$T1
	eor $C1,$C1,$T2
	eor $D1,$D1,$T3
	st1.8 {$A1-$D1},[$out],#64

	ld1.8 {$A1-$D1},[$inp],#64
	eor $A2,$A2,$A0
	ldp @K[0],@K[1],[sp,#0]
	eor $B2,$B2,$B0
	ldp @K[2],@K[3],[sp,#32]
	eor $C2,$C2,$C0
	eor $D2,$D2,$D0
	st1.8 {$A2-$D2},[$out],#64

	ld1.8 {$A2-$D2},[$inp],#64
	eor $A3,$A3,$A1
	eor $B3,$B3,$B1
	eor $C3,$C3,$C1
	eor $D3,$D3,$D1
	st1.8 {$A3-$D3},[$out],#64

	ld1.8 {$A3-$D3},[$inp],#64
	eor $A4,$A4,$A2
	eor $B4,$B4,$B2
	eor $C4,$C4,$C2
	eor $D4,$D4,$D2
	st1.8 {$A4-$D4},[$out],#64

	shl $A0,$ONE,#1 // 4 -> 8
	eor $A5,$A5,$A3
	eor $B5,$B5,$B3
	eor $C5,$C5,$C3
	eor $D5,$D5,$D3
	st1.8 {$A5-$D5},[$out],#64

	add @K[3],@K[3],$A0 // += 8
	add @K[4],@K[4],$A0
	add @K[5],@K[5],$A0
	add @K[6],@K[6],$A0

	b.hs .Loop_outer_512_neon

	adds $len,$len,#512
	ushr $A0,$ONE,#2 // 4 -> 1

	ldp d8,d9,[sp,#128+0] // meet ABI requirements
	ldp d10,d11,[sp,#128+16]
	ldp d12,d13,[sp,#128+32]
	ldp d14,d15,[sp,#128+48]

	stp @K[0],$ONE,[sp,#0] // wipe off-load area
	stp @K[0],$ONE,[sp,#32]
	stp @K[0],$ONE,[sp,#64]

	b.eq .Ldone_512_neon

	cmp $len,#192
	sub @K[3],@K[3],$A0 // -= 1
	sub @K[4],@K[4],$A0
	sub @K[5],@K[5],$A0
	add sp,sp,#128
	b.hs .Loop_outer_neon

	eor @K[1],@K[1],@K[1]
	eor @K[2],@K[2],@K[2]
	eor @K[3],@K[3],@K[3]
	eor @K[4],@K[4],@K[4]
	eor @K[5],@K[5],@K[5]
	eor @K[6],@K[6],@K[6]
	b .Loop_outer

.Ldone_512_neon:
	ldp x19,x20,[x29,#16]
	add sp,sp,#128+64
	ldp x21,x22,[x29,#32]
	ldp x23,x24,[x29,#48]
	ldp x25,x26,[x29,#64]
	ldp x27,x28,[x29,#80]
	ldp x29,x30,[sp],#96
	ret
.size ChaCha20_512_neon,.-ChaCha20_512_neon
#endif
#endif
___
}
}}}

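# Emit this script's own leading comment block (the license and performance
# notes above) into the generated file as "//" comments.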
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

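# Final pass over $code: evaluate any `...` expressions, then lower the
# perlasm shorthand used above to real AArch64 syntax: ".32" forms become
# plain instructions on w-registers; eor/ext/mov on ".4s" vectors become
# ".16b"; "ld1.8"/"st1.8" drop the suffix and operate on ".16b"; ldr/str/
# ldp/stp of "vN.4s" use the "qN" aliases; and "rev32.16" becomes rev32 on
# ".8h" halfwords.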
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
	(m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
	(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
	(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
	(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));

	print $_,"\n";
}
close STDOUT; # flush