Re: [kvm-unit-tests PATCH v6 1/1] riscv: sbi: Add tests for HSM extension

Andrew Jones <andrew.jones@xxxxxxxxx> · Fri, 1 Nov 2024 17:37:37 +0100

On Sun, Oct 27, 2024 at 12:18:13AM +0800, James Raphael Tiovalen wrote:
> Add some tests for all of the HSM extension functions. These tests
> ensure that the HSM extension functions follow the behavior as described
> in the SBI specification.
> 
> Signed-off-by: James Raphael Tiovalen <jamestiotio@xxxxxxxxx>
> ---
>  riscv/sbi.c | 663 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 663 insertions(+)
> 
> diff --git a/riscv/sbi.c b/riscv/sbi.c
> index 6f2d3e35..b09e2e45 100644
> --- a/riscv/sbi.c
> +++ b/riscv/sbi.c
> @@ -21,6 +21,7 @@
>  #include <asm/delay.h>
>  #include <asm/io.h>
>  #include <asm/mmu.h>
> +#include <asm/page.h>
>  #include <asm/processor.h>
>  #include <asm/sbi.h>
>  #include <asm/setup.h>
> @@ -54,6 +55,11 @@ static struct sbiret sbi_dbcn_write_byte(uint8_t byte)
>  	return sbi_ecall(SBI_EXT_DBCN, SBI_EXT_DBCN_CONSOLE_WRITE_BYTE, byte, 0, 0, 0, 0, 0);
>  }
>  
> +static struct sbiret sbi_hart_suspend(uint32_t suspend_type, unsigned long resume_addr, unsigned long opaque)
> +{
> +	return sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, suspend_type, resume_addr, opaque, 0, 0, 0);
> +}
> +
>  static struct sbiret sbi_system_suspend(uint32_t sleep_type, unsigned long resume_addr, unsigned long opaque)
>  {
>  	return sbi_ecall(SBI_EXT_SUSP, 0, sleep_type, resume_addr, opaque, 0, 0, 0);
> @@ -833,6 +839,662 @@ static void check_susp(void)
>  	report_prefix_pop();
>  }
>  
> +unsigned char sbi_hsm_stop_hart[NR_CPUS];
> +unsigned char sbi_hsm_hart_start_checks[NR_CPUS];
> +unsigned char sbi_hsm_non_retentive_hart_suspend_checks[NR_CPUS];

The above three are already present in the file. It's strange the compiler
doesn't warn about the redundant definition.

> +cpumask_t sbi_hsm_started_hart_checks;

This one can be static.

> +static bool sbi_hsm_invalid_hartid_check;
> +static bool sbi_hsm_timer_fired;
> +extern void sbi_hsm_check_hart_start(void);
> +extern void sbi_hsm_check_non_retentive_suspend(void);
> +
> +static void hsm_timer_irq_handler(struct pt_regs *regs)
> +{
> +	sbi_hsm_timer_fired = true;
> +	timer_stop();

nit: timer_stop() should be first (of course it doesn't matter much
here...)

> +}
> +
> +static void hsm_timer_setup(void)
> +{
> +	install_irq_handler(IRQ_S_TIMER, hsm_timer_irq_handler);
> +	local_irq_enable();
> +	timer_irq_enable();
> +}
> +
> +static void hsm_timer_teardown(void)
> +{
> +	timer_irq_disable();
> +	local_irq_disable();
> +	install_irq_handler(IRQ_S_TIMER, NULL);

nit: since the above functions are 'timer' functions by name, then
calling local_irq_enable/disable() is out of their scope.

> +}
> +
> +static void hart_empty_fn(void *data) {}

We have start_cpu() already in this file for a no-op function.

> +
> +static void hart_execute(void *data)

rename to 'hart_check_already_started'

> +{
> +	struct sbiret ret;
> +	unsigned long hartid = current_thread_info()->hartid;
> +	int me = smp_processor_id();
> +
> +	ret = sbi_hart_start(hartid, virt_to_phys(&hart_empty_fn), 0);
> +
> +	if (ret.error == SBI_ERR_ALREADY_AVAILABLE)
> +		cpumask_set_cpu(me, &sbi_hsm_started_hart_checks);
> +}
> +
> +static void hart_start_invalid_hartid(void *data)
> +{
> +	struct sbiret ret;
> +
> +	ret = sbi_hart_start(ULONG_MAX, virt_to_phys(&hart_empty_fn), 0);

nit: I think I prefer '-1UL' over ULONG_MAX since -1 is what is used in
the spec to refer to invalid hartids (which I interpret from the "Hart
list parameter" using -1 for hart_mask_base to indicate broadcast).

> +
> +	if (ret.error == SBI_ERR_INVALID_PARAM)
> +		sbi_hsm_invalid_hartid_check = true;
> +}
> +
> +static void hart_stop(void *data)
> +{
> +	unsigned long hartid = current_thread_info()->hartid;
> +	struct sbiret ret = sbi_hart_stop();
> +
> +	report_fail("failed to stop hart %ld (error=%ld)", hartid, ret.error);
> +}
> +
> +static void hart_retentive_suspend(void *data)
> +{
> +	unsigned long hartid = current_thread_info()->hartid;
> +	struct sbiret ret = sbi_hart_suspend(SBI_EXT_HSM_HART_SUSPEND_RETENTIVE, 0, 0);
> +
> +	if (ret.error)
> +		report_fail("failed to retentive suspend hart %ld (error=%ld)", hartid, ret.error);
> +}
> +
> +static void hart_non_retentive_suspend(void *data)
> +{
> +	unsigned long hartid = current_thread_info()->hartid;
> +
> +	/* Set opaque as hartid so that we can check a0 == a1, ensuring that a0 is hartid and a1 is opaque */
> +	struct sbiret ret = sbi_hart_suspend(SBI_EXT_HSM_HART_SUSPEND_NON_RETENTIVE,
> +					     virt_to_phys(&sbi_hsm_check_non_retentive_suspend), hartid);
> +
> +	report_fail("failed to non-retentive suspend hart %ld (error=%ld)", hartid, ret.error);
> +}
> +
> +static void hart_retentive_suspend_with_msb_set(void *data)
> +{
> +	unsigned long hartid = current_thread_info()->hartid;
> +	unsigned long suspend_type = SBI_EXT_HSM_HART_SUSPEND_RETENTIVE | (_AC(1, UL) << (__riscv_xlen - 1));
> +	struct sbiret ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, suspend_type, 0, 0, 0, 0, 0);
> +
> +	if (ret.error)
> +		report_fail("failed to retentive suspend hart %ld with MSB set (error=%ld)", hartid, ret.error);

This test, which I assume is to ensure the SBI implementation ignores
upper bits set in suspend_type, will only work on rv64. On rv32, it
will try to do a non-retentive suspend. I see below that it's only
being run on rv64. We should put a comment here making that clear.

> +}
> +
> +static void hart_non_retentive_suspend_with_msb_set(void *data)
> +{
> +	unsigned long hartid = current_thread_info()->hartid;
> +	unsigned long suspend_type = SBI_EXT_HSM_HART_SUSPEND_NON_RETENTIVE | (_AC(1, UL) << (__riscv_xlen - 1));
> +
> +	/* Set opaque as hartid so that we can check a0 == a1, ensuring that a0 is hartid and a1 is opaque */
> +	struct sbiret ret = sbi_ecall(SBI_EXT_HSM, SBI_EXT_HSM_HART_SUSPEND, suspend_type,
> +				      virt_to_phys(&sbi_hsm_check_non_retentive_suspend), hartid, 0, 0, 0);
> +
> +	report_fail("failed to non-retentive suspend hart %ld with MSB set (error=%ld)", hartid, ret.error);

Same comment as for hart_retentive_suspend_with_msb_set()

> +}
> +
> +static bool hart_wait_on_status(unsigned long hartid, enum sbi_ext_hsm_sid status, unsigned long duration)
> +{
> +	struct sbiret ret;
> +
> +	sbi_hsm_timer_fired = false;
> +	timer_start(duration);
> +
> +	ret = sbi_hart_get_status(hartid);
> +
> +	while (!ret.error && ret.value == status && !sbi_hsm_timer_fired) {
> +		cpu_relax();
> +		ret = sbi_hart_get_status(hartid);
> +	}
> +
> +	timer_stop();
> +
> +	if (sbi_hsm_timer_fired)
> +		report_info("timer fired while waiting on status %u for hart %ld", status, hartid);
> +	else if (ret.error)
> +		report_fail("got %ld while waiting on status %u for hart %ld\n", ret.error, status, hartid);
> +
> +	return sbi_hsm_timer_fired || ret.error;

For boolean returning functions I prefer 'true' to mean success and
'false' to me failure, so I would reverse this logic.

> +}
> +
> +static void check_hsm(void)
> +{
> +	struct sbiret ret;
> +	unsigned long hartid;
> +	cpumask_t secondary_cpus_mask, hsm_start, hsm_stop, hsm_suspend, hsm_resume, hsm_check;
> +	bool ipi_unavailable = false;
> +	bool suspend_with_msb = false, resume_with_msb = false, check_with_msb = false, stop_with_msb = false;
> +	int cpu, me = smp_processor_id();
> +	int max_cpus = getenv("SBI_MAX_CPUS") ? strtol(getenv("SBI_MAX_CPUS"), NULL, 0) : nr_cpus;
> +	unsigned long hsm_timer_duration = getenv("SBI_HSM_TIMER_DURATION")
> +					 ? strtol(getenv("SBI_HSM_TIMER_DURATION"), NULL, 0) : 200000;
> +
> +	max_cpus = MIN(max_cpus, nr_cpus);

We should further clamp this to cpumask_weight(&cpu_present_mask);

> +
> +	report_prefix_push("hsm");
> +
> +	if (!sbi_probe(SBI_EXT_HSM)) {
> +		report_skip("hsm extension not available");
> +		report_prefix_pop();
> +		return;
> +	}
> +
> +	report_prefix_push("hart_get_status");
> +
> +	hartid = current_thread_info()->hartid;
> +	ret = sbi_hart_get_status(hartid);
> +
> +	if (ret.error) {
> +		report_fail("failed to get status of current hart (error=%ld)", ret.error);
> +		report_prefix_popn(2);
> +		return;
> +	} else if (ret.value != SBI_EXT_HSM_STARTED) {
> +		report_fail("current hart is not started (ret.value=%ld)", ret.value);
> +		report_prefix_popn(2);
> +		return;
> +	}
> +
> +	report_pass("status of current hart is started");
> +
> +	for_each_present_cpu(cpu) {
> +		if (sbi_hart_get_status(cpus[cpu].hartid).error == SBI_ERR_INVALID_PARAM)
> +			set_cpu_present(cpu, false);
> +	}

We do this now in setup since commit b9e849027e0d ("riscv: Filter unmanaged
harts from present mask") which is merged into riscv/sbi.

> +
> +	report(cpumask_weight(&cpu_present_mask) == nr_cpus, "all present harts have valid hartids");

The above test should be dropped. It wouldn't be testing SBI it would be
testing the DT.

> +
> +	report_prefix_pop();
> +
> +	if (max_cpus < 2) {
> +		report_skip("no other cpus to run the remaining hsm tests on");
> +		report_prefix_pop();
> +		return;
> +	}
> +
> +	report_prefix_push("hart_stop");
> +
> +	cpumask_copy(&secondary_cpus_mask, &cpu_present_mask);
> +	cpumask_clear_cpu(me, &secondary_cpus_mask);
> +	hsm_timer_setup();
> +
> +	/* Assume that previous tests have not cleaned up and stopped the secondary harts */
> +	on_cpumask_async(&secondary_cpus_mask, hart_stop, NULL);
> +
> +	cpumask_clear(&hsm_stop);
> +
> +	for_each_cpu(cpu, &secondary_cpus_mask) {
> +		hartid = cpus[cpu].hartid;
> +		if (hart_wait_on_status(hartid, SBI_EXT_HSM_STARTED, hsm_timer_duration))
> +			continue;
> +		if (hart_wait_on_status(hartid, SBI_EXT_HSM_STOP_PENDING, hsm_timer_duration))
> +			continue;
> +
> +		ret = sbi_hart_get_status(hartid);
> +		if (ret.error)
> +			report_info("hart %ld get status failed (error=%ld)", hartid, ret.error);
> +		else if (ret.value != SBI_EXT_HSM_STOPPED)
> +			report_info("hart %ld status is not 'stopped' (ret.value=%ld)", hartid, ret.value);
> +		else
> +			cpumask_set_cpu(cpu, &hsm_stop);
> +	}
> +
> +	report(cpumask_weight(&hsm_stop) == max_cpus - 1, "all secondary harts stopped");

It looks like we just need a counter and not a cpumask for this test since
we only care about the weight of the mask.

> +
> +	report_prefix_pop();
> +
> +	report_prefix_push("hart_start");
> +
> +	cpumask_clear(&hsm_start);
> +	cpumask_clear(&hsm_check);
> +
> +	for_each_cpu(cpu, &secondary_cpus_mask) {
> +		hartid = cpus[cpu].hartid;
> +		/* Set opaque as hartid so that we can check a0 == a1, ensuring that a0 is hartid and a1 is opaque */

The only problem with this trick is that the SBI implementation could have
a bug that sets a0 to opaque and a1 to hartid and we won't catch it. It
would be better to embed hartid in some structure or array (and not at
offset 0) and then set opaque to the pointer of that structure. That's how
the SUSP test does it. Also SUSP puts a magic number at offset zero so
opaque can be tested by that alone. To change it we'll need to also write
a fixup patch for "riscv: sbi: Provide entry point for HSM tests".

> +		ret = sbi_hart_start(hartid, virt_to_phys(&sbi_hsm_check_hart_start), hartid);
> +		if (ret.error) {
> +			report_fail("failed to start test hart %ld (error=%ld)", hartid, ret.error);
> +			continue;
> +		}
> +