Hey again Thomas,

On Thu, May 05, 2022 at 01:02:02PM +0200, Jason A. Donenfeld wrote:
> Interestingly, disabling the simd paths makes things around 283 cycles
> slower on my Tiger Lake laptop, just doing ordinary things. I'm actually
> slightly surprised, so I'll probably keep playing with this. My patch
> for this is attached. Let me know if you have a different methodology in
> mind...

Measuring with RDPMC/perf, the gap comes out quite a bit smaller for
real-world cases than the ~283 cycles above, with the simd code only
~80 cycles faster than the generic code. Bench code follows below. If
the observation on this hardware holds for other hardware, we can
probably improve the performance of the generic code a bit, and then
the difference really won't matter.

Any thoughts on this and the test code?

Jason

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bd292927654c..6577e9f2f3b7 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -53,6 +53,7 @@
 #include <linux/uuid.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
+#include <linux/sort.h>
 #include <crypto/chacha.h>
 #include <crypto/blake2s.h>
 #include <asm/processor.h>
@@ -755,9 +756,54 @@ static struct {
 	.lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
 };
 
+static DEFINE_PER_CPU(int, pmc_index) = -1;
+static struct {
+	u32 durations[1 << 20];
+	u32 pos, len;
+} irqbench;
+
 static void _mix_pool_bytes(const void *in, size_t nbytes)
 {
+	int idx = *this_cpu_ptr(&pmc_index);
+	u32 ctr = input_pool.hash.t[0], reg = 0;
+	cycles_t end, start;
+
+
+	native_cpuid(&reg, &reg, &reg, &reg);
+	start = idx == -1 ? 0 : native_read_pmc(idx);
 	blake2s_update(&input_pool.hash, in, nbytes);
+	end = idx == -1 ? 0 : native_read_pmc(idx);
+
+	if (ctr == input_pool.hash.t[0] || !in_hardirq() || idx == -1)
+		return;
+
+	irqbench.durations[irqbench.pos++ % ARRAY_SIZE(irqbench.durations)] = end - start;
+	irqbench.len = min_t(u32, irqbench.len + 1, ARRAY_SIZE(irqbench.durations));
+}
+
+static int cmp_u32(const void *a, const void *b)
+{
+	return *(const u32 *)a - *(const u32 *)b;
+}
+
+static int proc_do_irqbench_median(struct ctl_table *table, int write, void *buffer,
+				   size_t *lenp, loff_t *ppos)
+{
+	u32 len = READ_ONCE(irqbench.len), median, *sorted;
+	struct ctl_table fake_table = {
+		.data = &median,
+		.maxlen = sizeof(median)
+	};
+	if (!len)
+		return -ENODATA;
+	sorted = kmalloc_array(len, sizeof(*sorted), GFP_KERNEL);
+	if (!sorted)
+		return -ENOMEM;
+	memcpy(sorted, irqbench.durations, len * sizeof(*sorted));
+	sort(sorted, len, sizeof(*sorted), cmp_u32, NULL);
+	median = sorted[len / 2];
+	kfree(sorted);
+	return write ? 0 : proc_douintvec(&fake_table, 0, buffer, lenp, ppos);
 }
 
 /*
@@ -1709,6 +1755,18 @@ static struct ctl_table random_table[] = {
 		.mode = 0444,
 		.proc_handler = proc_do_uuid,
 	},
+	{
+		.procname = "irqbench_median",
+		.mode = 0444,
+		.proc_handler = proc_do_irqbench_median,
+	},
+	{
+		.procname = "irqbench_count",
+		.data = &irqbench.len,
+		.maxlen = sizeof(irqbench.len),
+		.mode = 0444,
+		.proc_handler = proc_douintvec,
+	},
 	{ }
 };
 
@@ -1718,6 +1776,21 @@ static struct ctl_table random_table[] = {
  */
 static int __init random_sysctls_init(void)
 {
+	int i;
+	struct perf_event *cycles_event;
+	struct perf_event_attr perf_cycles_attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.size = sizeof(struct perf_event_attr),
+		.pinned = true
+	};
+	for_each_possible_cpu(i) {
+		cycles_event = perf_event_create_kernel_counter(&perf_cycles_attr, i, NULL, NULL, NULL);
+		if (IS_ERR(cycles_event))
+			pr_err("unable to create perf counter on cpu %d: %ld\n", i, PTR_ERR(cycles_event));
+		else
+			*per_cpu_ptr(&pmc_index, i) = cycles_event->hw.event_base_rdpmc;
+	}
 	register_sysctl_init("kernel/random", random_table);
 	return 0;
 }
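
For pulling the numbers back out, a minimal userspace reader along these
lines does the job. It's only a sketch: the paths assume the two entries
show up under /proc/sys/kernel/random/, which is where the
register_sysctl_init("kernel/random", ...) call above registers them.

/* Sketch of a reader for the bench sysctls added above; paths assume
 * the entries land under /proc/sys/kernel/random/. */
#include <stdio.h>

static unsigned int read_u32_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	unsigned int val = 0;

	if (!f || fscanf(f, "%u", &val) != 1)
		fprintf(stderr, "failed to read %s\n", path);
	if (f)
		fclose(f);
	return val;
}

int main(void)
{
	printf("samples: %u, median cycles: %u\n",
	       read_u32_sysctl("/proc/sys/kernel/random/irqbench_count"),
	       read_u32_sysctl("/proc/sys/kernel/random/irqbench_median"));
	return 0;
}

Note that reads of irqbench_median fail with ENODATA until at least one
sample has been recorded, so let some interrupts accumulate first; the
idea is then to compare the median with the simd path enabled and
disabled.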