/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2018 Google, Inc.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * Design notes:
 *
 * 16 registers would be needed to hold the state matrix, but only 14 are
 * available because 'sp' and 'pc' cannot be used. So we spill the elements
 * (x8, x9) to the stack and swap them out with (x10, x11). This adds one
 * 'ldrd' and one 'strd' instruction per round.
 *
 * All rotates are performed using the implicit rotate operand accepted by the
 * 'add' and 'eor' instructions. This is faster than using explicit rotate
 * instructions. To make this work, we allow the values in the second and last
 * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
 * wrong rotation amount. The rotation amount is then fixed up just in time
 * when the values are used. 'brot' is the number of bits the values in row 'b'
 * need to be rotated right to arrive at the correct values, and 'drot'
 * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
 * that they end up as (25, 24) after every round.
 */

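/*
 * Illustrative sketch (not from the original source): the effect of the
 * deferred rotations on one quarter-round, written as C-like pseudocode,
 * where ror() is a 32-bit rotate right and brot/drot are the rotation
 * amounts pending at the start of the round:
 *
 *	a += ror(b, brot);  d = ror(d, drot) ^ a;	// d now needs ror #16
 *	c += ror(d, 16);    b = ror(b, brot) ^ c;	// b now needs ror #20
 *	a += ror(b, 20);    d = ror(d, 16) ^ a;		// d now needs ror #24
 *	c += ror(d, 24);    b = ror(b, 20) ^ c;		// b now needs ror #25
 *
 * No separate rotate instruction is issued; each use of 'b' or 'd' applies
 * the pending rotation through the shifted-operand form of 'add'/'eor', and
 * every round leaves (brot, drot) == (25, 24).
 */
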
// ChaCha state registers
	X0	.req	r0
	X1	.req	r1
	X2	.req	r2
	X3	.req	r3
	X4	.req	r4
	X5	.req	r5
	X6	.req	r6
	X7	.req	r7
	X8_X10	.req	r8	// shared by x8 and x10
	X9_X11	.req	r9	// shared by x9 and x11
	X12	.req	r10
	X13	.req	r11
	X14	.req	r12
	X15	.req	r14

.Lexpand_32byte_k:
	// "expand 32-byte k"
	.word	0x61707865, 0x3320646e, 0x79622d32, 0x6b206574

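// Note (added for clarity): the 'adrl' pseudo-instruction is not accepted when
// assembling for Thumb-2, so fall back to plain 'adr', which has enough range
// to reach .Lexpand_32byte_k from its single use below.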
#ifdef __thumb2__
# define adrl adr
#endif

.macro __rev	out, in, t0, t1, t2
.if __LINUX_ARM_ARCH__ >= 6
	rev		\out, \in
.else
	lsl		\t0, \in, #24
	and		\t1, \in, #0xff00
	and		\t2, \in, #0xff0000
	orr		\out, \t0, \in, lsr #24
	orr		\out, \out, \t1, lsl #8
	orr		\out, \out, \t2, lsr #8
.endif
.endm
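// Worked example for the pre-v6 path (added for illustration): \in = 0xaabbccdd
// gives t0 = 0xdd000000, t1 = 0x0000cc00, t2 = 0x00bb0000, and the three 'orr'
// instructions assemble \out = 0xddccbbaa, i.e. a full 32-bit byte swap.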

.macro _le32_bswap	x, t0, t1, t2
#ifdef __ARMEB__
	__rev		\x, \x, \t0, \t1, \t2
#endif
.endm

.macro _le32_bswap_4x	a, b, c, d, t0, t1, t2
	_le32_bswap	\a, \t0, \t1, \t2
	_le32_bswap	\b, \t0, \t1, \t2
	_le32_bswap	\c, \t0, \t1, \t2
	_le32_bswap	\d, \t0, \t1, \t2
.endm

.macro __ldrd	a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

.macro __strd	a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

.macro _halfround	a1, b1, c1, d1, a2, b2, c2, d2

	// a += b; d ^= a; d = rol(d, 16);
	add		\a1, \a1, \b1, ror #brot
	add		\a2, \a2, \b2, ror #brot
	eor		\d1, \a1, \d1, ror #drot
	eor		\d2, \a2, \d2, ror #drot
	// drot == 32 - 16 == 16

	// c += d; b ^= c; b = rol(b, 12);
	add		\c1, \c1, \d1, ror #16
	add		\c2, \c2, \d2, ror #16
	eor		\b1, \c1, \b1, ror #brot
	eor		\b2, \c2, \b2, ror #brot
	// brot == 32 - 12 == 20

	// a += b; d ^= a; d = rol(d, 8);
	add		\a1, \a1, \b1, ror #20
	add		\a2, \a2, \b2, ror #20
	eor		\d1, \a1, \d1, ror #16
	eor		\d2, \a2, \d2, ror #16
	// drot == 32 - 8 == 24

	// c += d; b ^= c; b = rol(b, 7);
	add		\c1, \c1, \d1, ror #24
	add		\c2, \c2, \d2, ror #24
	eor		\b1, \c1, \b1, ror #20
	eor		\b2, \c2, \b2, ror #20
	// brot == 32 - 7 == 25
.endm

.macro _doubleround

	// column round

	// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
	_halfround	X0, X4, X8_X10, X12, X1, X5, X9_X11, X13

	// save (x8, x9); restore (x10, x11)
	__strd		X8_X10, X9_X11, sp, 0
	__ldrd		X8_X10, X9_X11, sp, 8

	// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
	_halfround	X2, X6, X8_X10, X14, X3, X7, X9_X11, X15

	.set brot, 25
	.set drot, 24

	// diagonal round

	// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
	_halfround	X0, X5, X8_X10, X15, X1, X6, X9_X11, X12

	// save (x10, x11); restore (x8, x9)
	__strd		X8_X10, X9_X11, sp, 8
	__ldrd		X8_X10, X9_X11, sp, 0

	// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
	_halfround	X2, X7, X8_X10, X13, X3, X4, X9_X11, X14
.endm

.macro _chacha_permute	nrounds
	.set brot, 0
	.set drot, 0
	.rept \nrounds / 2
	_doubleround
	.endr
.endm

.macro _chacha	nrounds

.Lnext_block\@:
	// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.

	// Do the core ChaCha permutation to update x0-x15.
	_chacha_permute	\nrounds

	add		sp, #8
	// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers contain x0-x9,x12-x15.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
	push		{X8_X10, X9_X11, X12, X13, X14, X15}

	// Load (OUT, IN, LEN).
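	// (OUT is at sp + 96: 24 bytes of registers just pushed, plus 8 bytes
	// of x10-x11, plus 64 bytes of the original x0-x15.)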
	ldr		r14, [sp, #96]
	ldr		r12, [sp, #100]
	ldr		r11, [sp, #104]

	orr		r10, r14, r12

	// Use slow path if fewer than 64 bytes remain.
	cmp		r11, #64
	blt		.Lxor_slowpath\@

	// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
	// ARMv6+, since ldmia and stmia (used below) still require alignment.
	tst		r10, #3
	bne		.Lxor_slowpath\@

	// Fast path: XOR 64 bytes of aligned data.

	// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// x0-x3
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3, r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		X0, X0, r8
	eor		X1, X1, r9
	eor		X2, X2, r10
	eor		X3, X3, r11
	stmia		r14!, {X0-X3}

	// x4-x7
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	ldmia		r12!, {X0-X3}
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7, r8, r9, r10
	eor		X4, X4, X0
	eor		X5, X5, X1
	eor		X6, X6, X2
	eor		X7, X7, X3
	stmia		r14!, {X4-X7}

	// x8-x15
	pop		{r0-r7}			// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 32
	__ldrd		r10, r11, sp, 40
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7, r8, r9, r10
	ldmia		r12!, {r8-r11}
	eor		r0, r0, r8		// x8
	eor		r1, r1, r9		// x9
	eor		r6, r6, r10		// x10
	eor		r7, r7, r11		// x11
	stmia		r14!, {r0,r1,r6,r7}
	ldmia		r12!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 48
	__ldrd		r10, r11, sp, 56
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5, r9, r10, r11
	ldr		r9, [sp, #72]		// load LEN
	eor		r2, r2, r0		// x12
	eor		r3, r3, r1		// x13
	eor		r4, r4, r6		// x14
	eor		r5, r5, r7		// x15
	subs		r9, #64			// decrement and check LEN
	stmia		r14!, {r2-r5}

	beq		.Ldone\@

.Lprepare_for_next_block\@:

	// Stack: x0-x15 OUT IN LEN

	// Increment block counter (x12)
	add		r8, #1

	// Store updated (OUT, IN, LEN)
	str		r14, [sp, #64]
	str		r12, [sp, #68]
	str		r9, [sp, #72]

	mov		r14, sp

	// Store updated block counter (x12)
	str		r8, [sp, #48]

	sub		sp, #16

	// Reload state and do next block
	ldmia		r14!, {r0-r11}		// load x0-x11
	__strd		r10, r11, sp, 8		// store x10-x11 before state
	ldmia		r14, {r10-r12,r14}	// load x12-x15
	b		.Lnext_block\@

.Lxor_slowpath\@:
	// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
	// We handle it by storing the 64 bytes of keystream to the stack, then
	// XOR-ing the needed portion with the data.

	// Allocate keystream buffer
	sub		sp, #64
	mov		r14, sp

	// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
	// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
	// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

	// Save keystream for x0-x3
	__ldrd		r8, r9, sp, 96
	__ldrd		r10, r11, sp, 104
	add		X0, X0, r8
	add		X1, X1, r9
	add		X2, X2, r10
	add		X3, X3, r11
	_le32_bswap_4x	X0, X1, X2, X3, r8, r9, r10
	stmia		r14!, {X0-X3}

	// Save keystream for x4-x7
	__ldrd		r8, r9, sp, 112
	__ldrd		r10, r11, sp, 120
	add		X4, r8, X4, ror #brot
	add		X5, r9, X5, ror #brot
	add		X6, r10, X6, ror #brot
	add		X7, r11, X7, ror #brot
	_le32_bswap_4x	X4, X5, X6, X7, r8, r9, r10
	add		r8, sp, #64
	stmia		r14!, {X4-X7}

	// Save keystream for x8-x15
	ldm		r8, {r0-r7}		// (x8-x9,x12-x15,x10-x11)
	__ldrd		r8, r9, sp, 128
	__ldrd		r10, r11, sp, 136
	add		r0, r0, r8		// x8
	add		r1, r1, r9		// x9
	add		r6, r6, r10		// x10
	add		r7, r7, r11		// x11
	_le32_bswap_4x	r0, r1, r6, r7, r8, r9, r10
	stmia		r14!, {r0,r1,r6,r7}
	__ldrd		r8, r9, sp, 144
	__ldrd		r10, r11, sp, 152
	add		r2, r8, r2, ror #drot	// x12
	add		r3, r9, r3, ror #drot	// x13
	add		r4, r10, r4, ror #drot	// x14
	add		r5, r11, r5, ror #drot	// x15
	_le32_bswap_4x	r2, r3, r4, r5, r9, r10, r11
	stmia		r14, {r2-r5}

	// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
	// Registers: r8 is block counter, r12 is IN.

	ldr		r9, [sp, #168]		// LEN
	ldr		r14, [sp, #160]		// OUT
	cmp		r9, #64
	mov		r0, sp
	movle		r1, r9
	movgt		r1, #64
	// r1 is number of bytes to XOR, in range [1, 64]

.if __LINUX_ARM_ARCH__ < 6
	orr		r2, r12, r14
	tst		r2, #3			// IN or OUT misaligned?
	bne		.Lxor_next_byte\@
.endif

	// XOR a word at a time
	.rept 16
	subs		r1, #4
	blt		.Lxor_words_done\@
	ldr		r2, [r12], #4
	ldr		r3, [r0], #4
	eor		r2, r2, r3
	str		r2, [r14], #4
	.endr
	b		.Lxor_slowpath_done\@
.Lxor_words_done\@:
	ands		r1, r1, #3
	beq		.Lxor_slowpath_done\@

	// XOR a byte at a time
.Lxor_next_byte\@:
	ldrb		r2, [r12], #1
	ldrb		r3, [r0], #1
	eor		r2, r2, r3
	strb		r2, [r14], #1
	subs		r1, #1
	bne		.Lxor_next_byte\@

.Lxor_slowpath_done\@:
	subs		r9, #64
	add		sp, #96
	bgt		.Lprepare_for_next_block\@

.Ldone\@:
.endm	// _chacha

/*
 * void chacha20_arm(u8 *out, const u8 *in, size_t len, const u32 key[8],
 *		     const u32 iv[4]);
 */
SYM_FUNC_START(chacha20_arm)
	cmp		r2, #0			// len == 0?
	reteq		lr

	push		{r0-r2,r4-r11,lr}

	// Push state x0-x15 onto stack.
	// Also store an extra copy of x10-x11 just before the state.

	ldr		r4, [sp, #48]		// iv
	mov		r0, sp
	sub		sp, #80

	// iv: x12-x15
	ldm		r4, {X12,X13,X14,X15}
	stmdb		r0!, {X12,X13,X14,X15}

	// key: x4-x11
	__ldrd		X8_X10, X9_X11, r3, 24
	__strd		X8_X10, X9_X11, sp, 8
	stmdb		r0!, {X8_X10, X9_X11}
	ldm		r3, {X4-X9_X11}
	stmdb		r0!, {X4-X9_X11}

	// constants: x0-x3
	adrl		X3, .Lexpand_32byte_k
	ldm		X3, {X0-X3}
	__strd		X0, X1, sp, 16
	__strd		X2, X3, sp, 24

	_chacha		20

	add		sp, #76
	pop		{r4-r11, pc}
SYM_FUNC_END(chacha20_arm)
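
/*
 * Illustrative caller sketch (added for clarity; not part of the original
 * source, and the variable names are hypothetical). Per the prototype above
 * and the state setup in chacha20_arm(), key[0..7] become x4-x11 and iv[0..3]
 * become x12-x15, so iv[0] is the initial block counter (the word incremented
 * between blocks) and iv[1..3] are the nonce words:
 *
 *	u32 key[8], iv[4];
 *	// ... fill key[] with the 256-bit key and iv[1..3] with the nonce ...
 *	iv[0] = 0;	// starting block counter
 *	chacha20_arm(out, in, len, key, iv);
 */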

/*
 * void hchacha20_arm(const u32 state[16], u32 out[8]);
 */
SYM_FUNC_START(hchacha20_arm)
	push		{r1,r4-r11,lr}

	mov		r14, r0
	ldmia		r14!, {r0-r11}		// load x0-x11
	push		{r10-r11}		// store x10-x11 to stack
	ldm		r14, {r10-r12,r14}	// load x12-x15
	sub		sp, #8

	_chacha_permute	20

	// Skip over (unused0-unused1, x10-x11)
	add		sp, #16

	// Fix up rotations of x12-x15
	ror		X12, X12, #drot
	ror		X13, X13, #drot
	pop		{r4}			// load 'out'
	ror		X14, X14, #drot
	ror		X15, X15, #drot

	// Store (x0-x3,x12-x15) to 'out'
	stm		r4, {X0,X1,X2,X3,X12,X13,X14,X15}

	pop		{r4-r11,pc}
SYM_FUNC_END(hchacha20_arm)
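
/*
 * Note (added for clarity): hchacha20_arm() applies the 20-round permutation
 * to state[0..15] without the final feed-forward addition and writes words
 * 0-3 and 12-15 of the permuted state to out[0..7]. This is the HChaCha20
 * construction, typically used to derive the subkey for XChaCha20.
 */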