Add a KVM_CAP to control WFx (WFI or WFE) trapping based on scheduler runqueue depth. This is so they can be passed through if the runqueue is shallow or the CPU has support for direct interrupt injection. They may always be trapped by setting this value to 0. Technically this means traps will be cleared when the runqueue depth is 0, but that implies nothing is running anyway so there is no reason to care. The default value is 1 to preserve previous behavior before adding this option. Think of this option as a threshold. The instruction will be trapped if the runqueue depth is higher than the threshold. Signed-off-by: Colton Lewis <coltonlewis@xxxxxxxxxx> --- v2: The last version was exclusively a flag to enable unconditional wfx passthrough but there was feedback to make passthrough/trapping depend on runqueue depth. I asked the last thread if there were any preferences for the interface to accomplish this but I figured it's easier to show code than wait for people telling me what to do. v1: https://lore.kernel.org/kvmarm/20240129213918.3124494-1-coltonlewis@xxxxxxxxxx/ arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 7 ++++++- include/linux/sched/stat.h | 1 + include/uapi/linux/kvm.h | 2 +- kernel/sched/core.c | 15 +++++++++++++-- 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 21c57b812569..79f461efaa6c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -317,6 +317,7 @@ struct kvm_arch { * the associated pKVM instance in the hypervisor. 
*/ struct kvm_protected_vm pkvm; + u64 wfx_trap_runqueue_depth; }; struct kvm_vcpu_fault_info { diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a25265aca432..419eed6e1814 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -116,6 +116,9 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, } mutex_unlock(&kvm->slots_lock); break; + case KVM_CAP_ARM_WFX_TRAP_RUNQUEUE_DEPTH: + kvm->arch.wfx_trap_runqueue_depth = cap->args[0]; + break; default: r = -EINVAL; break; @@ -176,6 +179,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + kvm->arch.wfx_trap_runqueue_depth = 1; return 0; err_free_cpumask: @@ -240,6 +244,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_SYSTEM_SUSPEND: case KVM_CAP_IRQFD_RESAMPLE: case KVM_CAP_COUNTER_OFFSET: + case KVM_CAP_ARM_WFX_TRAP_RUNQUEUE_DEPTH: r = 1; break; case KVM_CAP_SET_GUEST_DEBUG2: @@ -456,7 +461,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (kvm_arm_is_pvtime_enabled(&vcpu->arch)) kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu); - if (single_task_running()) + if (nr_running_this_cpu() <= vcpu->kvm->arch.wfx_trap_runqueue_depth) vcpu_clear_wfx_traps(vcpu); else vcpu_set_wfx_traps(vcpu); diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 0108a38bb64d..dc1541fcec56 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -18,6 +18,7 @@ extern int nr_threads; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned int nr_running(void); +extern unsigned int nr_running_this_cpu(void); extern bool single_task_running(void); extern unsigned int nr_iowait(void); extern unsigned int nr_iowait_cpu(int cpu); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index c3308536482b..4c0ebf514c03 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1155,6 +1155,7 @@ struct kvm_ppc_resize_hpt { 
#define KVM_CAP_MEMORY_ATTRIBUTES 233 #define KVM_CAP_GUEST_MEMFD 234 #define KVM_CAP_VM_TYPES 235 +#define KVM_CAP_ARM_WFX_TRAP_RUNQUEUE_DEPTH 236 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..b18f29964648 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5420,7 +5420,7 @@ unsigned int nr_running(void) } /* - * Check if only the current task is running on the CPU. + * Return number of tasks running on this CPU. * * Caution: this function does not check that the caller has disabled * preemption, thus the result might have a time-of-check-to-time-of-use @@ -5432,9 +5432,20 @@ unsigned int nr_running(void) * * - in a loop with very short iterations (e.g. a polling loop) */ +unsigned int nr_running_this_cpu(void) +{ + return raw_rq()->nr_running; +} +EXPORT_SYMBOL(nr_running_this_cpu); + +/* + * Check if only the current task is running on the CPU. + * + * Caution: see warning for nr_running_this_cpu + */ bool single_task_running(void) { - return raw_rq()->nr_running == 1; + return nr_running_this_cpu() == 1; } EXPORT_SYMBOL(single_task_running); -- 2.44.0.291.gc1ea87d7ee-goog