Backporting to 5.16 looks good to me. Michael On Tue, 2022-04-12 at 08:29 +0200, Greg Kroah-Hartman wrote: > From: Michael T. Kloos <michael@xxxxxxxxxxxxxxxx> > > [ Upstream commit 9d1f0ec9f71780e69ceb9d91697600c747d6e02e ] > > Rewrote the RISC-V memmove() assembly implementation. The > previous implementation did not check memory alignment and it > compared 2 pointers with a signed comparison. The misaligned > memory access would cause the kernel to crash on systems that > did not emulate it in firmware and did not support it in hardware. > Firmware emulation is slow and may not exist. The RISC-V spec > does not guarantee that support for misaligned memory accesses > will exist. It should not be depended on. > > This patch now checks for XLEN granularity of co-alignment between > the pointers. Failing that, copying is done by loading from the 2 > contiguous and naturally aligned XLEN memory locations containing > the overlapping XLEN sized data to be copied. The data is shifted > into the correct place and binary or'ed together on each > iteration. The result is then stored into the corresponding > naturally aligned XLEN sized location in the destination. For > unaligned data at the terminations of the regions to be copied > or for copies less than (2 * XLEN) in size, byte copy is used. > > This patch also now uses unsigned comparison for the pointers and > migrates to the newer assembler annotations from the now deprecated > ones. > > Signed-off-by: Michael T. Kloos <michael@xxxxxxxxxxxxxxxx> > Signed-off-by: Palmer Dabbelt <palmer@xxxxxxxxxxxx> > Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx> > --- > arch/riscv/lib/memmove.S | 368 +++++++++++++++++++++++++++++++++------ > 1 file changed, 310 insertions(+), 58 deletions(-) > > diff --git a/arch/riscv/lib/memmove.S b/arch/riscv/lib/memmove.S > index 07d1d2152ba5..e0609e1f0864 100644 > --- a/arch/riscv/lib/memmove.S > +++ b/arch/riscv/lib/memmove.S > @@ -1,64 +1,316 @@ > -/* SPDX-License-Identifier: GPL-2.0 */ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (C) 2022 Michael T. Kloos <michael@xxxxxxxxxxxxxxxx> > + */ > > #include <linux/linkage.h> > #include <asm/asm.h> > > -ENTRY(__memmove) > -WEAK(memmove) > - move t0, a0 > - move t1, a1 > - > - beq a0, a1, exit_memcpy > - beqz a2, exit_memcpy > - srli t2, a2, 0x2 > - > - slt t3, a0, a1 > - beqz t3, do_reverse > - > - andi a2, a2, 0x3 > - li t4, 1 > - beqz t2, byte_copy > - > -word_copy: > - lw t3, 0(a1) > - addi t2, t2, -1 > - addi a1, a1, 4 > - sw t3, 0(a0) > - addi a0, a0, 4 > - bnez t2, word_copy > - beqz a2, exit_memcpy > - j byte_copy > - > -do_reverse: > - add a0, a0, a2 > - add a1, a1, a2 > - andi a2, a2, 0x3 > - li t4, -1 > - beqz t2, reverse_byte_copy > - > -reverse_word_copy: > - addi a1, a1, -4 > - addi t2, t2, -1 > - lw t3, 0(a1) > - addi a0, a0, -4 > - sw t3, 0(a0) > - bnez t2, reverse_word_copy > - beqz a2, exit_memcpy > - > -reverse_byte_copy: > - addi a0, a0, -1 > - addi a1, a1, -1 > +SYM_FUNC_START(__memmove) > +SYM_FUNC_START_WEAK(memmove) > + /* > + * Returns > + * a0 - dest > + * > + * Parameters > + * a0 - Inclusive first byte of dest > + * a1 - Inclusive first byte of src > + * a2 - Length of copy n > + * > + * Because the return matches the parameter register a0, > + * we will not clobber or modify that register. > + * > + * Note: This currently only works on little-endian. > + * To port to big-endian, reverse the direction of shifts > + * in the 2 misaligned fixup copy loops. > + */ > > + /* Return if nothing to do */ > + beq a0, a1, return_from_memmove > + beqz a2, return_from_memmove > + > + /* > + * Register Uses > + * Forward Copy: a1 - Index counter of src > + * Reverse Copy: a4 - Index counter of src > + * Forward Copy: t3 - Index counter of dest > + * Reverse Copy: t4 - Index counter of dest > + * Both Copy Modes: t5 - Inclusive first multibyte/aligned of dest > + * Both Copy Modes: t6 - Non-Inclusive last multibyte/aligned of dest > + * Both Copy Modes: t0 - Link / Temporary for load-store > + * Both Copy Modes: t1 - Temporary for load-store > + * Both Copy Modes: t2 - Temporary for load-store > + * Both Copy Modes: a5 - dest to src alignment offset > + * Both Copy Modes: a6 - Shift ammount > + * Both Copy Modes: a7 - Inverse Shift ammount > + * Both Copy Modes: a2 - Alternate breakpoint for unrolled loops > + */ > + > + /* > + * Solve for some register values now. > + * Byte copy does not need t5 or t6. > + */ > + mv t3, a0 > + add t4, a0, a2 > + add a4, a1, a2 > + > + /* > + * Byte copy if copying less than (2 * SZREG) bytes. This can > + * cause problems with the bulk copy implementation and is > + * small enough not to bother. > + */ > + andi t0, a2, -(2 * SZREG) > + beqz t0, byte_copy > + > + /* > + * Now solve for t5 and t6. > + */ > + andi t5, t3, -SZREG > + andi t6, t4, -SZREG > + /* > + * If dest(Register t3) rounded down to the nearest naturally > + * aligned SZREG address, does not equal dest, then add SZREG > + * to find the low-bound of SZREG alignment in the dest memory > + * region. Note that this could overshoot the dest memory > + * region if n is less than SZREG. This is one reason why > + * we always byte copy if n is less than SZREG. > + * Otherwise, dest is already naturally aligned to SZREG. > + */ > + beq t5, t3, 1f > + addi t5, t5, SZREG > + 1: > + > + /* > + * If the dest and src are co-aligned to SZREG, then there is > + * no need for the full rigmarole of a full misaligned fixup copy. > + * Instead, do a simpler co-aligned copy. > + */ > + xor t0, a0, a1 > + andi t1, t0, (SZREG - 1) > + beqz t1, coaligned_copy > + /* Fall through to misaligned fixup copy */ > + > +misaligned_fixup_copy: > + bltu a1, a0, misaligned_fixup_copy_reverse > + > +misaligned_fixup_copy_forward: > + jal t0, byte_copy_until_aligned_forward > + > + andi a5, a1, (SZREG - 1) /* Find the alignment offset of src (a1) */ > + slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ > + sub a5, a1, t3 /* Find the difference between src and dest */ > + andi a1, a1, -SZREG /* Align the src pointer */ > + addi a2, t6, SZREG /* The other breakpoint for the unrolled loop*/ > + > + /* > + * Compute The Inverse Shift > + * a7 = XLEN - a6 = XLEN + -a6 > + * 2s complement negation to find the negative: -a6 = ~a6 + 1 > + * Add that to XLEN. XLEN = SZREG * 8. > + */ > + not a7, a6 > + addi a7, a7, (SZREG * 8 + 1) > + > + /* > + * Fix Misalignment Copy Loop - Forward > + * load_val0 = load_ptr[0]; > + * do { > + * load_val1 = load_ptr[1]; > + * store_ptr += 2; > + * store_ptr[0 - 2] = (load_val0 >> {a6}) | (load_val1 << {a7}); > + * > + * if (store_ptr == {a2}) > + * break; > + * > + * load_val0 = load_ptr[2]; > + * load_ptr += 2; > + * store_ptr[1 - 2] = (load_val1 >> {a6}) | (load_val0 << {a7}); > + * > + * } while (store_ptr != store_ptr_end); > + * store_ptr = store_ptr_end; > + */ > + > + REG_L t0, (0 * SZREG)(a1) > + 1: > + REG_L t1, (1 * SZREG)(a1) > + addi t3, t3, (2 * SZREG) > + srl t0, t0, a6 > + sll t2, t1, a7 > + or t2, t0, t2 > + REG_S t2, ((0 * SZREG) - (2 * SZREG))(t3) > + > + beq t3, a2, 2f > + > + REG_L t0, (2 * SZREG)(a1) > + addi a1, a1, (2 * SZREG) > + srl t1, t1, a6 > + sll t2, t0, a7 > + or t2, t1, t2 > + REG_S t2, ((1 * SZREG) - (2 * SZREG))(t3) > + > + bne t3, t6, 1b > + 2: > + mv t3, t6 /* Fix the dest pointer in case the loop was broken */ > + > + add a1, t3, a5 /* Restore the src pointer */ > + j byte_copy_forward /* Copy any remaining bytes */ > + > +misaligned_fixup_copy_reverse: > + jal t0, byte_copy_until_aligned_reverse > + > + andi a5, a4, (SZREG - 1) /* Find the alignment offset of src (a4) */ > + slli a6, a5, 3 /* Multiply by 8 to convert that to bits to shift */ > + sub a5, a4, t4 /* Find the difference between src and dest */ > + andi a4, a4, -SZREG /* Align the src pointer */ > + addi a2, t5, -SZREG /* The other breakpoint for the unrolled loop*/ > + > + /* > + * Compute The Inverse Shift > + * a7 = XLEN - a6 = XLEN + -a6 > + * 2s complement negation to find the negative: -a6 = ~a6 + 1 > + * Add that to XLEN. XLEN = SZREG * 8. > + */ > + not a7, a6 > + addi a7, a7, (SZREG * 8 + 1) > + > + /* > + * Fix Misalignment Copy Loop - Reverse > + * load_val1 = load_ptr[0]; > + * do { > + * load_val0 = load_ptr[-1]; > + * store_ptr -= 2; > + * store_ptr[1] = (load_val0 >> {a6}) | (load_val1 << {a7}); > + * > + * if (store_ptr == {a2}) > + * break; > + * > + * load_val1 = load_ptr[-2]; > + * load_ptr -= 2; > + * store_ptr[0] = (load_val1 >> {a6}) | (load_val0 << {a7}); > + * > + * } while (store_ptr != store_ptr_end); > + * store_ptr = store_ptr_end; > + */ > + > + REG_L t1, ( 0 * SZREG)(a4) > + 1: > + REG_L t0, (-1 * SZREG)(a4) > + addi t4, t4, (-2 * SZREG) > + sll t1, t1, a7 > + srl t2, t0, a6 > + or t2, t1, t2 > + REG_S t2, ( 1 * SZREG)(t4) > + > + beq t4, a2, 2f > + > + REG_L t1, (-2 * SZREG)(a4) > + addi a4, a4, (-2 * SZREG) > + sll t0, t0, a7 > + srl t2, t1, a6 > + or t2, t0, t2 > + REG_S t2, ( 0 * SZREG)(t4) > + > + bne t4, t5, 1b > + 2: > + mv t4, t5 /* Fix the dest pointer in case the loop was broken */ > + > + add a4, t4, a5 /* Restore the src pointer */ > + j byte_copy_reverse /* Copy any remaining bytes */ > + > +/* > + * Simple copy loops for SZREG co-aligned memory locations. > + * These also make calls to do byte copies for any unaligned > + * data at their terminations. > + */ > +coaligned_copy: > + bltu a1, a0, coaligned_copy_reverse > + > +coaligned_copy_forward: > + jal t0, byte_copy_until_aligned_forward > + > + 1: > + REG_L t1, ( 0 * SZREG)(a1) > + addi a1, a1, SZREG > + addi t3, t3, SZREG > + REG_S t1, (-1 * SZREG)(t3) > + bne t3, t6, 1b > + > + j byte_copy_forward /* Copy any remaining bytes */ > + > +coaligned_copy_reverse: > + jal t0, byte_copy_until_aligned_reverse > + > + 1: > + REG_L t1, (-1 * SZREG)(a4) > + addi a4, a4, -SZREG > + addi t4, t4, -SZREG > + REG_S t1, ( 0 * SZREG)(t4) > + bne t4, t5, 1b > + > + j byte_copy_reverse /* Copy any remaining bytes */ > + > +/* > + * These are basically sub-functions within the function. They > + * are used to byte copy until the dest pointer is in alignment. > + * At which point, a bulk copy method can be used by the > + * calling code. These work on the same registers as the bulk > + * copy loops. Therefore, the register values can be picked > + * up from where they were left and we avoid code duplication > + * without any overhead except the call in and return jumps. > + */ > +byte_copy_until_aligned_forward: > + beq t3, t5, 2f > + 1: > + lb t1, 0(a1) > + addi a1, a1, 1 > + addi t3, t3, 1 > + sb t1, -1(t3) > + bne t3, t5, 1b > + 2: > + jalr zero, 0x0(t0) /* Return to multibyte copy loop */ > + > +byte_copy_until_aligned_reverse: > + beq t4, t6, 2f > + 1: > + lb t1, -1(a4) > + addi a4, a4, -1 > + addi t4, t4, -1 > + sb t1, 0(t4) > + bne t4, t6, 1b > + 2: > + jalr zero, 0x0(t0) /* Return to multibyte copy loop */ > + > +/* > + * Simple byte copy loops. > + * These will byte copy until they reach the end of data to copy. > + * At that point, they will call to return from memmove. > + */ > byte_copy: > - lb t3, 0(a1) > - addi a2, a2, -1 > - sb t3, 0(a0) > - add a1, a1, t4 > - add a0, a0, t4 > - bnez a2, byte_copy > - > -exit_memcpy: > - move a0, t0 > - move a1, t1 > - ret > -END(__memmove) > + bltu a1, a0, byte_copy_reverse > + > +byte_copy_forward: > + beq t3, t4, 2f > + 1: > + lb t1, 0(a1) > + addi a1, a1, 1 > + addi t3, t3, 1 > + sb t1, -1(t3) > + bne t3, t4, 1b > + 2: > + ret > + > +byte_copy_reverse: > + beq t4, t3, 2f > + 1: > + lb t1, -1(a4) > + addi a4, a4, -1 > + addi t4, t4, -1 > + sb t1, 0(t4) > + bne t4, t3, 1b > + 2: > + > +return_from_memmove: > + ret > + > +SYM_FUNC_END(memmove) > +SYM_FUNC_END(__memmove)