Re: [PATCH v3 7/8] RISC-V: Report vector unaligned access speed hwprobe

Jesse Taube <jesse@xxxxxxxxxxxx> · Thu, 11 Jul 2024 16:35:48 -0400

On 7/1/24 18:51, Evan Green wrote:
On Mon, Jun 24, 2024 at 5:52 PM Jesse Taube <jesse@xxxxxxxxxxxx> wrote:

Detect if vector misaligned accesses are faster or slower than
equivalent vector byte accesses. This is useful for usermode to know
whether vector byte accesses or vector misaligned accesses have a better
bandwidth for operations like memcpy.

Signed-off-by: Jesse Taube <jesse@xxxxxxxxxxxx>
---
V1 -> V2:
  - Add Kconfig options
  - Add WORD_EEW to vec-copy-unaligned.S
V2 -> V3:
  - Remove unnecessary comment
  - Remove local_irq_enable
---
  arch/riscv/Kconfig                         |  18 +++
  arch/riscv/kernel/Makefile                 |   3 +-
  arch/riscv/kernel/copy-unaligned.h         |   5 +
  arch/riscv/kernel/sys_hwprobe.c            |   6 +
  arch/riscv/kernel/unaligned_access_speed.c | 132 ++++++++++++++++++++-
  arch/riscv/kernel/vec-copy-unaligned.S     |  58 +++++++++
  6 files changed, 219 insertions(+), 3 deletions(-)
  create mode 100644 arch/riscv/kernel/vec-copy-unaligned.S

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index ffbe0fdd7fb3..6f9fd3748916 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -807,6 +807,24 @@ config RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
           will dynamically determine the speed of vector unaligned accesses on
           the underlying system if they are supported.

+config RISCV_SLOW_VEC_UNALIGNED_ACCESS
+       bool "Assume the system supports slow vector unaligned memory accesses"
+       depends on NONPORTABLE
+       help
+         Assume that the system supports slow vector unaligned memory accesses. The
+         kernel and userspace programs may not be able to run at all on systems
+         that do not support unaligned memory accesses.
+
+config RISCV_EFFICIENT_VEC_UNALIGNED_ACCESS
+       bool "Assume the system supports fast vector unaligned memory accesses"
+       depends on NONPORTABLE
+       help
+         Assume that the system supports fast vector unaligned memory accesses. When
+         enabled, this option improves the performance of the kernel on such
+         systems. However, the kernel and userspace programs will run much more
+         slowly, or will not be able to run at all, on systems that do not
+         support efficient unaligned memory accesses.
+
  endchoice

  endmenu # "Platform type"
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 5b243d46f4b1..291935a084d5 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -64,7 +64,8 @@ obj-$(CONFIG_MMU) += vdso.o vdso/

  obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
  obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
-obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)     += copy-unaligned.o
+obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)             += copy-unaligned.o
+obj-$(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)      += vec-copy-unaligned.o

  obj-$(CONFIG_FPU)              += fpu.o
  obj-$(CONFIG_FPU)              += kernel_mode_fpu.o
diff --git a/arch/riscv/kernel/copy-unaligned.h b/arch/riscv/kernel/copy-unaligned.h
index e3d70d35b708..85d4d11450cb 100644
--- a/arch/riscv/kernel/copy-unaligned.h
+++ b/arch/riscv/kernel/copy-unaligned.h
@@ -10,4 +10,9 @@
  void __riscv_copy_words_unaligned(void *dst, const void *src, size_t size);
  void __riscv_copy_bytes_unaligned(void *dst, const void *src, size_t size);

+#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+void __riscv_copy_vec_words_unaligned(void *dst, const void *src, size_t size);
+void __riscv_copy_vec_bytes_unaligned(void *dst, const void *src, size_t size);
+#endif
+
  #endif /* __RISCV_KERNEL_COPY_UNALIGNED_H */
diff --git a/arch/riscv/kernel/sys_hwprobe.c b/arch/riscv/kernel/sys_hwprobe.c
index 5b78ea5a84d1..46de3f145154 100644
--- a/arch/riscv/kernel/sys_hwprobe.c
+++ b/arch/riscv/kernel/sys_hwprobe.c
@@ -221,6 +221,12 @@ static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
  #else
  static u64 hwprobe_vec_misaligned(const struct cpumask *cpus)
  {
+       if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_VEC_UNALIGNED_ACCESS))
+               return RISCV_HWPROBE_VEC_MISALIGNED_FAST;
+
+       if (IS_ENABLED(CONFIG_RISCV_SLOW_VEC_UNALIGNED_ACCESS))
+               return RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
+
         return RISCV_HWPROBE_VEC_MISALIGNED_UNKNOWN;
  }
  #endif
diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index 8489e012cf23..a2a8f948500b 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -8,9 +8,11 @@
  #include <linux/jump_label.h>
  #include <linux/mm.h>
  #include <linux/smp.h>
+#include <linux/kthread.h>
  #include <linux/types.h>
  #include <asm/cpufeature.h>
  #include <asm/hwprobe.h>
+#include <asm/vector.h>

  #include "copy-unaligned.h"

@@ -267,9 +269,129 @@ static int check_unaligned_access_speed_all_cpus(void)
  }
  #endif

+#ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
+static void check_vector_unaligned_access(struct work_struct *unused)
+{
+       int cpu = smp_processor_id();
+       u64 start_cycles, end_cycles;
+       u64 word_cycles;
+       u64 byte_cycles;
+       int ratio;
+       unsigned long start_jiffies, now;
+       struct page *page;
+       void *dst;
+       void *src;
+       long speed = RISCV_HWPROBE_VEC_MISALIGNED_SLOW;
+
+       if (per_cpu(vector_misaligned_access, cpu) != RISCV_HWPROBE_VEC_MISALIGNED_SLOW)
+               return;
+
+       page = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+       if (!page) {
+               pr_warn("Allocation failure, not measuring vector misaligned performance\n");
+               return;
+       }
+
+       /* Make an unaligned destination buffer. */
+       dst = (void *)((unsigned long)page_address(page) | 0x1);
+       /* Unalign src as well, but differently (off by 1 + 2 = 3). */
+       src = dst + (MISALIGNED_BUFFER_SIZE / 2);
+       src += 2;
+       word_cycles = -1ULL;
+
+       /* Do a warmup. */
+       kernel_vector_begin();

Should there be a preempt_disable() in here too, or is that implied
already by schedule_on_each_cpu? Mainly we want to minimize the amount
of things that might soak up our jiffy and steal iterations from the
loop.

To test I put a schedule here and it says OOPS we are attomic.

I added preempt_disable and preempt_enable to be sure preemption is 
disabled anyway.


+       __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+
+       start_jiffies = jiffies;
+       while ((now = jiffies) == start_jiffies)
+               cpu_relax();
+
+       /*
+        * For a fixed amount of time, repeatedly try the function, and take
+        * the best time in cycles as the measurement.
+        */
+       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+               start_cycles = get_cycles64();
+               /* Ensure the CSR read can't reorder WRT to the copy. */
+               mb();
+               __riscv_copy_vec_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+               /* Ensure the copy ends before the end time is snapped. */
+               mb();
+               end_cycles = get_cycles64();
+               if ((end_cycles - start_cycles) < word_cycles)
+                       word_cycles = end_cycles - start_cycles;
+       }
+
+       byte_cycles = -1ULL;
+       __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+       start_jiffies = jiffies;
+       while ((now = jiffies) == start_jiffies)
+               cpu_relax();
+
+       while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
+               start_cycles = get_cycles64();
+               mb();
+               __riscv_copy_vec_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
+               mb();
+               end_cycles = get_cycles64();
+               if ((end_cycles - start_cycles) < byte_cycles)
+                       byte_cycles = end_cycles - start_cycles;
+       }
+
+       kernel_vector_end();
+
+       /* Don't divide by zero. */
+       if (!word_cycles || !byte_cycles) {
+               pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned vector access speed\n",
+                       cpu);
+
+               return;
+       }
+
+       if (word_cycles < byte_cycles)
+               speed = RISCV_HWPROBE_VEC_MISALIGNED_FAST;
+
+       ratio = div_u64((byte_cycles * 100), word_cycles);
+       pr_info("cpu%d: Ratio of vector byte access time to vector unaligned word access is %d.%02d, unaligned accesses are %s\n",
+               cpu,
+               ratio / 100,
+               ratio % 100,
+               (speed ==  RISCV_HWPROBE_VEC_MISALIGNED_FAST) ? "fast" : "slow");
+
+       per_cpu(vector_misaligned_access, cpu) = speed;
+}
+
+static int riscv_online_cpu_vec(unsigned int cpu)
+{
+       check_vector_unaligned_access(NULL);
+       return 0;
+}
+
+/* Measure unaligned access speed on all CPUs present at boot in parallel. */
+static int vec_check_unaligned_access_speed_all_cpus(void *unused)
+{
+       schedule_on_each_cpu(check_vector_unaligned_access);
+
+       /*
+        * Setup hotplug callbacks for any new CPUs that come online or go
+        * offline.
+        */
+       cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
+                                 riscv_online_cpu_vec, NULL);
+
+       return 0;
+}
+#else /* CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS */
+static int vec_check_unaligned_access_speed_all_cpus(void *unused)
+{
+       return 0;
+}
+#endif
+
  static int check_unaligned_access_all_cpus(void)
  {
-       bool all_cpus_emulated;
+       bool all_cpus_emulated, all_cpus_vec_supported;
         int cpu;

         if (riscv_has_extension_unlikely(RISCV_ISA_EXT_ZICCLSM)) {
@@ -285,7 +407,13 @@ static int check_unaligned_access_all_cpus(void)
         }

         all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
-       check_vector_unaligned_access_emulated_all_cpus();
+       all_cpus_vec_supported = check_vector_unaligned_access_emulated_all_cpus();
+
+       if (all_cpus_vec_supported &&

Note that with this here, if a heterogeneous system ever shows up with
a mix of supported/unsupported, the supported cpus will end up with a
misaligned vector speed of UNKNOWN. It might be nicer to change the
semantics of that return value from all_cpus_vec_supported (aka
entirely_supported) to entirely_unsupported. Then you'd only fire up
the thread if (!entirely_unsupported). If you did that, you'd also
need to bail early in the check on any CPUs that already had their
value set to UNSUPPORTED.

Fortunatly check_vector_unaligned_access already checks this.

That way we can still avoid firing up this
thread for machines that don't have misaligned V (or V at all), but
not leave the question UNKNOWN in some cases.

That makes a lot of sence I'm not sure why I didnt think ot that.
I changed it to `all_cpus_vec_unsupported` so it will check the speed if 
atleast one cpu has unaligned vector support.

Thanks,
Jesse taube


+           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
+               kthread_run(vec_check_unaligned_access_speed_all_cpus,
+                           NULL, "vec_check_unaligned_access_speed_all_cpus");
+       }

         if (!all_cpus_emulated)
                 return check_unaligned_access_speed_all_cpus();
diff --git a/arch/riscv/kernel/vec-copy-unaligned.S b/arch/riscv/kernel/vec-copy-unaligned.S
new file mode 100644
index 000000000000..e5bc94917e60
--- /dev/null
+++ b/arch/riscv/kernel/vec-copy-unaligned.S
@@ -0,0 +1,58 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2024 Rivos Inc. */
+
+#include <linux/linkage.h>
+#include <asm/asm.h>
+#include <linux/args.h>
+
+       .text
+
+#define WORD_EEW 32
+
+#define WORD_SEW CONCATENATE(e, WORD_EEW)
+#define VEC_L CONCATENATE(vle, WORD_EEW).v
+#define VEC_S CONCATENATE(vle, WORD_EEW).v
+
+/* void __riscv_copy_vec_words_unaligned(void *, const void *, size_t) */
+/* Performs a memcpy without aligning buffers, using word loads and stores. */
+/* Note: The size is truncated to a multiple of WORD_EEW */
+SYM_FUNC_START(__riscv_copy_vec_words_unaligned)
+       andi  a4, a2, ~(WORD_EEW-1)
+       beqz  a4, 2f
+       add   a3, a1, a4
+       .option push
+       .option arch, +zve32x
+1:
+       vsetivli t0, 8, WORD_SEW, m8, ta, ma
+       VEC_L v0, (a1)
+       VEC_S v0, (a0)
+       addi  a0, a0, WORD_EEW
+       addi  a1, a1, WORD_EEW
+       bltu  a1, a3, 1b
+
+2:
+       .option pop
+       ret
+SYM_FUNC_END(__riscv_copy_vec_words_unaligned)
+
+/* void __riscv_copy_vec_bytes_unaligned(void *, const void *, size_t) */
+/* Performs a memcpy without aligning buffers, using only byte accesses. */
+/* Note: The size is truncated to a multiple of 8 */
+SYM_FUNC_START(__riscv_copy_vec_bytes_unaligned)
+       andi a4, a2, ~(8-1)
+       beqz a4, 2f
+       add  a3, a1, a4
+       .option push
+       .option arch, +zve32x
+1:
+       vsetivli t0, 8, e8, m8, ta, ma
+       vle8.v v0, (a1)
+       vse8.v v0, (a0)
+       addi a0, a0, 8
+       addi a1, a1, 8
+       bltu a1, a3, 1b
+
+2:
+       .option pop
+       ret
+SYM_FUNC_END(__riscv_copy_vec_bytes_unaligned)
--
2.45.2