Hey again Thomas,

On Thu, May 05, 2022 at 01:02:02PM +0200, Jason A. Donenfeld wrote:
> Interestingly, disabling the simd paths makes things around 283 cycles
> slower on my Tiger Lake laptop, just doing ordinary things. I'm actually
> slightly surprised, so I'll probably keep playing with this. My patch
> for this is attached. Let me know if you have a different methodology in
> mind...

Measuring with RDPMC/perf, the gap comes out quite a bit smaller for
real-world cases than the ~283 cycles above, with the simd code only
~80 cycles faster than the generic code. Bench code follows below. If
the observation on this hardware holds for other hardware, we can
probably improve the performance of the generic code a bit, and then
the difference really won't matter.

Any thoughts on this and the test code?

Jason

diff --git a/drivers/char/random.c b/drivers/char/random.c
index bd292927654c..6577e9f2f3b7 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -53,6 +53,7 @@
 #include <linux/uuid.h>
 #include <linux/uaccess.h>
 #include <linux/suspend.h>
+#include <linux/sort.h>
 #include <crypto/chacha.h>
 #include <crypto/blake2s.h>
 #include <asm/processor.h>
@@ -755,9 +756,54 @@ static struct {
 	.lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
 };
 
+static DEFINE_PER_CPU(int, pmc_index) = -1;
+static struct {
+	u32 durations[1 << 20];
+	u32 pos, len;
+} irqbench;
+
 static void _mix_pool_bytes(const void *in, size_t nbytes)
 {
+	int idx = *this_cpu_ptr(&pmc_index);
+	u32 ctr = input_pool.hash.t[0], reg = 0;
+	cycles_t end, start;
+
+
+	native_cpuid(&reg, &reg, &reg, &reg);
+	start = idx == -1 ? 0 : native_read_pmc(idx);
 	blake2s_update(&input_pool.hash, in, nbytes);
+	end = idx == -1 ? 0 : native_read_pmc(idx);
+
+	if (ctr == input_pool.hash.t[0] || !in_hardirq() || idx == -1)
+		return;
+
+	irqbench.durations[irqbench.pos++ % ARRAY_SIZE(irqbench.durations)] = end - start;
+	irqbench.len = min_t(u32, irqbench.len + 1, ARRAY_SIZE(irqbench.durations));
+}
+
+static int cmp_u32(const void *a, const void *b)
+{
+	return *(const u32 *)a - *(const u32 *)b;
+}
+
+static int proc_do_irqbench_median(struct ctl_table *table, int write, void *buffer,
+				   size_t *lenp, loff_t *ppos)
+{
+	u32 len = READ_ONCE(irqbench.len), median, *sorted;
+	struct ctl_table fake_table = {
+		.data = &median,
+		.maxlen = sizeof(median)
+	};
+	if (!len)
+		return -ENODATA;
+	sorted = kmalloc_array(len, sizeof(*sorted), GFP_KERNEL);
+	if (!sorted)
+		return -ENOMEM;
+	memcpy(sorted, irqbench.durations, len * sizeof(*sorted));
+	sort(sorted, len, sizeof(*sorted), cmp_u32, NULL);
+	median = sorted[len / 2];
+	kfree(sorted);
+	return write ? 0 : proc_douintvec(&fake_table, 0, buffer, lenp, ppos);
 }
 
 /*
@@ -1709,6 +1755,18 @@ static struct ctl_table random_table[] = {
 		.mode = 0444,
 		.proc_handler = proc_do_uuid,
 	},
+	{
+		.procname = "irqbench_median",
+		.mode = 0444,
+		.proc_handler = proc_do_irqbench_median,
+	},
+	{
+		.procname = "irqbench_count",
+		.data = &irqbench.len,
+		.maxlen = sizeof(irqbench.len),
+		.mode = 0444,
+		.proc_handler = proc_douintvec,
+	},
 	{ }
 };
 
@@ -1718,6 +1776,21 @@ static struct ctl_table random_table[] = {
  */
 static int __init random_sysctls_init(void)
 {
+	int i;
+	struct perf_event *cycles_event;
+	struct perf_event_attr perf_cycles_attr = {
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+		.size = sizeof(struct perf_event_attr),
+		.pinned = true
+	};
+	for_each_possible_cpu(i) {
+		cycles_event = perf_event_create_kernel_counter(&perf_cycles_attr, i, NULL, NULL, NULL);
+		if (IS_ERR(cycles_event))
+			pr_err("unable to create perf counter on cpu %d: %ld\n", i, PTR_ERR(cycles_event));
+		else
+			*per_cpu_ptr(&pmc_index, i) = cycles_event->hw.event_base_rdpmc;
+	}
 	register_sysctl_init("kernel/random", random_table);
 	return 0;
 }
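
For pulling the numbers back out, a minimal userspace reader along these
lines does the job. It's only a sketch: the paths assume the two entries
show up under /proc/sys/kernel/random/, which is where the
register_sysctl_init("kernel/random", ...) call above registers them.

/* Sketch of a reader for the bench sysctls added above; paths assume
 * the entries land under /proc/sys/kernel/random/. */
#include <stdio.h>

static unsigned int read_u32_sysctl(const char *path)
{
	FILE *f = fopen(path, "r");
	unsigned int val = 0;

	if (!f || fscanf(f, "%u", &val) != 1)
		fprintf(stderr, "failed to read %s\n", path);
	if (f)
		fclose(f);
	return val;
}

int main(void)
{
	printf("samples: %u, median cycles: %u\n",
	       read_u32_sysctl("/proc/sys/kernel/random/irqbench_count"),
	       read_u32_sysctl("/proc/sys/kernel/random/irqbench_median"));
	return 0;
}

Note that reads of irqbench_median fail with ENODATA until at least one
sample has been recorded, so let some interrupts accumulate first; the
idea is then to compare the median with the simd path enabled and
disabled.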