Add an uncached (based on MOVNTI) implementation of memset().

memset_movnti() only needs to differ from memset_orig() in the opcode
used in the inner loop, so move the memset_orig() logic into a macro,
and use that to generate memset_movq() and memset_movnti().

Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
 arch/x86/lib/memset_64.S | 68 ++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9827ae267f96..ef2a091563d9 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -25,7 +25,7 @@ SYM_FUNC_START(__memset)
 	 *
 	 * Otherwise, use original memset function.
 	 */
-	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+	ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
 		      "jmp memset_erms", X86_FEATURE_ERMS
 
 	movq %rdi,%r9
@@ -66,7 +66,8 @@ SYM_FUNC_START_LOCAL(memset_erms)
 	ret
 SYM_FUNC_END(memset_erms)
 
-SYM_FUNC_START_LOCAL(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START_LOCAL(memset_\OP)
 	movq %rdi,%r10
 
 	/* expand byte value  */
@@ -77,64 +78,71 @@ SYM_FUNC_START_LOCAL(memset_orig)
 	/* align dst */
 	movl  %edi,%r9d
 	andl  $7,%r9d
-	jnz  .Lbad_alignment
-.Lafter_bad_alignment:
+	jnz  .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
 
 	movq  %rdx,%rcx
 	shrq  $6,%rcx
-	jz	 .Lhandle_tail
+	jz	 .Lhandle_tail_\@
 
 	.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
 	decq  %rcx
-	movq  %rax,(%rdi)
-	movq  %rax,8(%rdi)
-	movq  %rax,16(%rdi)
-	movq  %rax,24(%rdi)
-	movq  %rax,32(%rdi)
-	movq  %rax,40(%rdi)
-	movq  %rax,48(%rdi)
-	movq  %rax,56(%rdi)
+	\OP  %rax,(%rdi)
+	\OP  %rax,8(%rdi)
+	\OP  %rax,16(%rdi)
+	\OP  %rax,24(%rdi)
+	\OP  %rax,32(%rdi)
+	\OP  %rax,40(%rdi)
+	\OP  %rax,48(%rdi)
+	\OP  %rax,56(%rdi)
 	leaq  64(%rdi),%rdi
-	jnz    .Lloop_64
+	jnz    .Lloop_64_\@
 
 	/* Handle tail in loops. The loops should be faster than hard
 	   to predict jump tables. */
 	.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
 	movl	%edx,%ecx
 	andl    $63&(~7),%ecx
-	jz 		.Lhandle_7
+	jz 		.Lhandle_7_\@
 	shrl	$3,%ecx
 	.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
 	decl   %ecx
-	movq  %rax,(%rdi)
+	\OP  %rax,(%rdi)
 	leaq  8(%rdi),%rdi
-	jnz    .Lloop_8
+	jnz    .Lloop_8_\@
 
-.Lhandle_7:
+.Lhandle_7_\@:
 	andl	$7,%edx
-	jz      .Lende
+	jz      .Lende_\@
 	.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
 	decl    %edx
 	movb 	%al,(%rdi)
 	leaq	1(%rdi),%rdi
-	jnz     .Lloop_1
+	jnz     .Lloop_1_\@
 
-.Lende:
+.Lende_\@:
+	.if \fence
+	sfence
+	.endif
 	movq	%r10,%rax
 	ret
 
-.Lbad_alignment:
+.Lbad_alignment_\@:
 	cmpq $7,%rdx
-	jbe	.Lhandle_7
+	jbe	.Lhandle_7_\@
 	movq %rax,(%rdi)	/* unaligned store */
 	movq $8,%r8
 	subq %r9,%r8
 	addq %r8,%rdi
 	subq %r8,%rdx
-	jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+	jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
-- 
2.29.2
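
P.S. (not part of the patch): for readers unfamiliar with non-temporal stores, below is
a minimal userspace sketch of the store pattern that the fence=1 instantiation
(memset_movnti) generates, written with SSE2 intrinsics rather than the kernel's
assembly. _mm_stream_si64() compiles to MOVNTI and _mm_sfence() supplies the same
store fence. The function name memset_nt_sketch() and the alignment/size assumptions
are illustrative only; the assembly version handles the unaligned head and the
sub-8-byte tail itself.

/*
 * Illustrative sketch only, not part of the patch: the MOVNTI + SFENCE
 * pattern of memset_movnti(), expressed with SSE2 intrinsics. Assumes
 * dst is 8-byte aligned and len is a multiple of 8 to keep it short.
 */
#include <emmintrin.h>		/* _mm_stream_si64() -> MOVNTI (x86-64 only) */
#include <xmmintrin.h>		/* _mm_sfence() */
#include <stddef.h>

static void memset_nt_sketch(void *dst, int c, size_t len)
{
	/* expand the byte value into all 8 lanes, as the movabs/imulq pair does */
	long long v = (long long)(0x0101010101010101ULL * (unsigned char)c);
	long long *p = dst;
	size_t i;

	for (i = 0; i < len / 8; i++)
		_mm_stream_si64(p + i, v);	/* non-temporal store, bypasses the cache */

	_mm_sfence();	/* what the .if \fence / sfence / .endif block adds */
}

MOVNTI stores are weakly ordered and write around the cache hierarchy, which is why
the fence=1 variant issues sfence before returning, and why it is attractive for
buffers large enough that caching the freshly-written data would only evict useful
lines.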