/* Copyright 2002 Andi Kleen, SuSE Labs */

#include <linux/linkage.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>

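/*
 * memset is declared weak, presumably so that an alternative
 * implementation (e.g. an instrumented one such as KASAN's) can take
 * precedence at link time; __memset always names this routine.
 */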
.weak memset

/*
 * ISO C memset - set a memory block to a byte value. This function uses fast
 * string instructions to get better performance than the original function.
 * The code is simpler and shorter than the original function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
ENTRY(memset)
ENTRY(__memset)
	/*
	 * Some CPUs support the enhanced REP MOVSB/STOSB (ERMS) feature,
	 * which is preferred when available. Failing that, use the fast
	 * string (REP STOSQ) path below; otherwise fall back to the
	 * original memset function.
	 */
	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
		      "jmp memset_erms", X86_FEATURE_ERMS
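
	/*
	 * The ALTERNATIVE_2 above is resolved once at boot: the kernel's
	 * alternatives patching rewrites it in place according to the
	 * CPU's feature flags, so no branch is evaluated at runtime. On
	 * REP_GOOD hardware it is replaced by NOPs and execution falls
	 * through to the REP STOSQ code below.
	 */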

	movq %rdi,%r9			/* save dst for the return value */
	movq %rdx,%rcx
	andl $7,%edx			/* edx = trailing byte count (n % 8) */
	shrq $3,%rcx			/* rcx = qword count (n / 8) */
	/* expand byte value */
	movzbl %sil,%esi
	movabs $0x0101010101010101,%rax
	imulq %rsi,%rax
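	/*
	 * The multiply above replicates the fill byte into every byte
	 * lane of %rax. A rough C equivalent (illustrative sketch only):
	 *
	 *	uint64_t pattern = (uint8_t)value * 0x0101010101010101ULL;
	 *
	 * e.g. value 0xab yields 0xabababababababab.
	 */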
	rep stosq			/* store n / 8 qwords */
	movl %edx,%ecx
	rep stosb			/* store the remaining n % 8 bytes */
	movq %r9,%rax
	ret
ENDPROC(memset)
ENDPROC(__memset)
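
/*
 * For reference, a rough C sketch of the fast-string path above
 * (illustrative only; the name is made up, and the real code is
 * selected by boot-time patching rather than a runtime call):
 *
 *	void *memset_rep_sketch(void *dst, int c, size_t n)
 *	{
 *		uint64_t pattern = (uint8_t)c * 0x0101010101010101ULL;
 *		uint64_t *q = dst;
 *		unsigned char *p;
 *		size_t i;
 *
 *		for (i = 0; i < n / 8; i++)	// rep stosq
 *			*q++ = pattern;
 *		p = (unsigned char *)q;
 *		for (i = 0; i < n % 8; i++)	// rep stosb
 *			p[i] = (uint8_t)c;
 *		return dst;
 *	}
 */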

/*
 * ISO C memset - set a memory block to a byte value. This function uses
 * enhanced REP STOSB to override the fast string function. The code is
 * simpler and shorter than the fast string function as well.
 *
 * rdi   destination
 * rsi   value (char)
 * rdx   count (bytes)
 *
 * rax   original destination
 */
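/*
 * On CPUs advertising ERMS, the microcoded REP STOSB is expected to
 * perform at least as well as the manual qword loop across sizes, so
 * no unrolling, byte expansion, or alignment code is needed here.
 */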
ENTRY(memset_erms)
	movq %rdi,%r9			/* save dst for the return value */
	movb %sil,%al			/* fill byte; no expansion needed */
	movq %rdx,%rcx
	rep stosb			/* store all n bytes in one go */
	movq %r9,%rax
	ret
ENDPROC(memset_erms)

ENTRY(memset_orig)
	movq %rdi,%r10			/* save dst for the return value */

	/* expand byte value */
	movzbl %sil,%ecx
	movabs $0x0101010101010101,%rax
	imulq %rcx,%rax

	/* align dst */
	movl %edi,%r9d
	andl $7,%r9d
	jnz .Lbad_alignment
.Lafter_bad_alignment:

	movq %rdx,%rcx
	shrq $6,%rcx			/* rcx = number of 64-byte blocks */
	jz .Lhandle_tail

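	/*
	 * Main loop: eight qword stores per iteration, i.e. one 64-byte
	 * block, unrolled to amortize the loop overhead.
	 */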
	.p2align 4
.Lloop_64:
	decq %rcx
	movq %rax,(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)
	movq %rax,24(%rdi)
	movq %rax,32(%rdi)
	movq %rax,40(%rdi)
	movq %rax,48(%rdi)
	movq %rax,56(%rdi)
	leaq 64(%rdi),%rdi
	jnz .Lloop_64

	/*
	 * Handle the tail in loops. The loops should be faster than
	 * hard-to-predict jump tables.
	 */
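	/*
	 * Rough C sketch of the tail handling below (illustrative only,
	 * reusing the names from the sketch after __memset):
	 *
	 *	size_t tail = n & 63;	// bytes left after 64-byte blocks
	 *	while (tail >= 8) {	// .Lloop_8
	 *		*(uint64_t *)p = pattern;
	 *		p += 8; tail -= 8;
	 *	}
	 *	while (tail--)		// .Lloop_1
	 *		*p++ = (uint8_t)c;
	 */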
	.p2align 4
.Lhandle_tail:
	movl %edx,%ecx
	andl $63&(~7),%ecx		/* bytes left below 64, rounded down to a multiple of 8 */
	jz .Lhandle_7
	shrl $3,%ecx			/* ecx = number of qwords in the tail */
	.p2align 4
.Lloop_8:
	decl %ecx
	movq %rax,(%rdi)
	leaq 8(%rdi),%rdi
	jnz .Lloop_8

.Lhandle_7:
	andl $7,%edx			/* edx = final 0..7 stray bytes */
	jz .Lende
	.p2align 4
.Lloop_1:
	decl %edx
	movb %al,(%rdi)
	leaq 1(%rdi),%rdi
	jnz .Lloop_1

.Lende:
	movq %r10,%rax			/* return the original dst */
	ret

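	/*
	 * Unaligned destination: store one full (possibly unaligned)
	 * qword at dst, then advance dst to the next 8-byte boundary.
	 * The overlapping bytes are rewritten with the same value, so
	 * the overlap is harmless.
	 */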
.Lbad_alignment:
	cmpq $7,%rdx
	jbe .Lhandle_7			/* tiny buffer: the byte loop handles it all */
	movq %rax,(%rdi)		/* unaligned store */
	movq $8,%r8
	subq %r9,%r8			/* r8 = bytes to the next 8-byte boundary */
	addq %r8,%rdi
	subq %r8,%rdx
	jmp .Lafter_bad_alignment
.Lfinal:
ENDPROC(memset_orig)