Adding sched_ext folks

On Wed, Apr 3, 2024 at 10:01 AM Vineeth Pillai (Google)
<vineeth@xxxxxxxxxxxxxxx> wrote:
>
> KVM uses the kernel's paravirt scheduling (pvsched) framework to assign
> an available pvsched driver to a guest. Guest vCPUs register with the
> pvsched driver and call into the driver callbacks to notify it of the
> events it is interested in.
>
> This PoC doesn't do the callback on interrupt injection yet. It will be
> implemented in subsequent iterations.
>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
> Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
> ---
>  arch/x86/kvm/Kconfig     |  13 ++++
>  arch/x86/kvm/x86.c       |   3 +
>  include/linux/kvm_host.h |  32 +++++++++
>  virt/kvm/kvm_main.c      | 148 +++++++++++++++++++++++++++++++++++++++
>  4 files changed, 196 insertions(+)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 65ed14b6540b..c1776cdb5b65 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -189,4 +189,17 @@ config KVM_MAX_NR_VCPUS
>           the memory footprint of each KVM guest, regardless of how many vCPUs are
>           created for a given VM.
>
> +config PARAVIRT_SCHED_KVM
> +        bool "Enable paravirt scheduling capability for kvm"
> +        depends on KVM
> +        default n
> +        help
> +          Paravirtualized scheduling facilitates the exchange of
> +          scheduling-related information between the host and guest through
> +          shared memory, enhancing the efficiency of vCPU thread scheduling
> +          by the hypervisor. An illustrative use case involves dynamically
> +          boosting the priority of a vCPU thread when the guest is executing
> +          a latency-sensitive workload on that specific vCPU.
> +          This config enables paravirt scheduling in the kvm hypervisor.
> +
>  endif # VIRTUALIZATION
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ffe580169c93..d0abc2c64d47 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10896,6 +10896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
>          preempt_disable();
>
> +        kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMENTER);
> +
>          static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
>
>          /*
> @@ -11059,6 +11061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>          guest_timing_exit_irqoff();
>
>          local_irq_enable();
> +        kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMEXIT);
>          preempt_enable();
>
>          kvm_vcpu_srcu_read_lock(vcpu);
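
One thing worth highlighting for the sched_ext folks: both notifications
above fire inside vcpu_enter_guest() with preemption disabled (the VMEXIT
one after local_irq_enable() but before preempt_enable()), so the driver
hook has to be non-sleeping. A minimal sketch of the driver-side callback
is below; the signature matches how ops->pvsched_vcpu_notify_event() is
invoked later in this patch, but the function name and the pr_debug()
bodies are placeholders, and a real driver would likely defer any
sched_setscheduler()-style boosting out of this atomic path:

/*
 * Hypothetical driver callback matching the call
 * ops->pvsched_vcpu_notify_event(addr, pid, events) made by this patch.
 * 'addr' is NULL for now; the guest shared memory page comes later.
 */
static void my_pvsched_vcpu_notify_event(void *addr, struct pid *pid,
                                         u32 events)
{
        /* Runs with preemption disabled: must not sleep. */
        if (events & PVSCHED_VCPU_VMENTER)
                pr_debug("pvsched: pid %d entering guest\n", pid_nr(pid));
        if (events & PVSCHED_VCPU_VMEXIT)
                pr_debug("pvsched: pid %d exited to host\n", pid_nr(pid));
}
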
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 179df96b20f8..6381569f3de8 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -45,6 +45,8 @@
>  #include <asm/kvm_host.h>
>  #include <linux/kvm_dirty_ring.h>
>
> +#include <linux/pvsched.h>
> +
>  #ifndef KVM_MAX_VCPU_IDS
>  #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
>  #endif
> @@ -832,6 +834,11 @@ struct kvm {
>          bool vm_bugged;
>          bool vm_dead;
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +        spinlock_t pvsched_ops_lock;
> +        struct pvsched_vcpu_ops __rcu *pvsched_ops;
> +#endif
> +
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>          struct notifier_block pm_notifier;
>  #endif
> @@ -2413,4 +2420,29 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
>  }
>  #endif /* CONFIG_KVM_PRIVATE_MEM */
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events);
> +int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu);
> +void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu);
> +
> +int kvm_replace_pvsched_ops(struct kvm *kvm, char *name);
> +#else
> +static inline int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
> +{
> +        return 0;
> +}
> +static inline int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
> +{
> +        return 0;
> +}
> +static inline void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +static inline int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
> +{
> +        return 0;
> +}
> +#endif
> +
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0f50960b0e3a..0546814e4db7 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -170,6 +170,142 @@ bool kvm_is_zone_device_page(struct page *page)
>          return is_zone_device_page(page);
>  }
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +typedef enum {
> +        PVSCHED_CB_REGISTER = 1,
> +        PVSCHED_CB_UNREGISTER = 2,
> +        PVSCHED_CB_NOTIFY = 3
> +} pvsched_vcpu_callback_t;
> +
> +/*
> + * Helper function to invoke the pvsched driver callbacks.
> + */
> +static int __vcpu_pvsched_callback(struct kvm_vcpu *vcpu, u32 events,
> +                                   pvsched_vcpu_callback_t action)
> +{
> +        int ret = 0;
> +        struct pid *pid;
> +        struct pvsched_vcpu_ops *ops;
> +
> +        rcu_read_lock();
> +        ops = rcu_dereference(vcpu->kvm->pvsched_ops);
> +        if (!ops) {
> +                ret = -ENOENT;
> +                goto out;
> +        }
> +
> +        pid = rcu_dereference(vcpu->pid);
> +        if (WARN_ON_ONCE(!pid)) {
> +                ret = -EINVAL;
> +                goto out;
> +        }
> +        get_pid(pid);
> +        switch (action) {
> +        case PVSCHED_CB_REGISTER:
> +                ops->pvsched_vcpu_register(pid);
> +                break;
> +        case PVSCHED_CB_UNREGISTER:
> +                ops->pvsched_vcpu_unregister(pid);
> +                break;
> +        case PVSCHED_CB_NOTIFY:
> +                if (ops->events & events) {
> +                        ops->pvsched_vcpu_notify_event(
> +                                NULL, /* TODO: Pass guest allocated sharedmem addr */
> +                                pid,
> +                                ops->events & events);
> +                }
> +                break;
> +        default:
> +                WARN_ON_ONCE(1);
> +        }
> +        put_pid(pid);
> +
> +out:
> +        rcu_read_unlock();
> +        return ret;
> +}
> +
> +int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
> +{
> +        return __vcpu_pvsched_callback(vcpu, events, PVSCHED_CB_NOTIFY);
> +}
> +
> +int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
> +{
> +        return __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_REGISTER);
> +        /*
> +         * TODO: Action if the registration fails?
> +         */
> +}
> +
> +void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
> +{
> +        __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_UNREGISTER);
> +}
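
Stepping back, the contract a pvsched driver must satisfy is visible from
the helper above: an events mask that gates notifications, plus
register/unregister/notify callbacks keyed by the vCPU task's struct pid.
A rough sketch of what a driver fills in is below. The callback signatures
and the PVSCHED_VCPU_* flags are taken from this patch; the .name field
and how the ops get registered with the framework (that part of the series
is not shown here) are assumptions:

/* Hypothetical driver, under the assumptions noted above. */
static void my_vcpu_register(struct pid *pid)
{
        /* Set up per-vCPU tracking state keyed by the task pid. */
}

static void my_vcpu_unregister(struct pid *pid)
{
        /* Tear down the per-vCPU state. */
}

static void my_vcpu_notify_event(void *addr, struct pid *pid, u32 events)
{
        /* React to the events this driver subscribed to below. */
}

static struct pvsched_vcpu_ops my_pvsched_ops = {
        .name                      = "my_pvsched", /* assumed lookup key */
        .events                    = PVSCHED_VCPU_VMENTER |
                                     PVSCHED_VCPU_VMEXIT |
                                     PVSCHED_VCPU_HALT,
        .pvsched_vcpu_register     = my_vcpu_register,
        .pvsched_vcpu_unregister   = my_vcpu_unregister,
        .pvsched_vcpu_notify_event = my_vcpu_notify_event,
};

Presumably pvsched_get_vcpu_ops(name) resolves that name and takes a
reference (likely a module refcount) that pvsched_put_vcpu_ops() drops.
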
> +
> +/*
> + * Replace the VM's current pvsched driver.
> + * If name is NULL or an empty string, unassign the
> + * current driver.
> + */
> +int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
> +{
> +        int ret = 0;
> +        unsigned long i;
> +        struct kvm_vcpu *vcpu = NULL;
> +        struct pvsched_vcpu_ops *ops = NULL, *prev_ops;
> +
> +        spin_lock(&kvm->pvsched_ops_lock);
> +
> +        prev_ops = rcu_dereference_protected(kvm->pvsched_ops,
> +                                lockdep_is_held(&kvm->pvsched_ops_lock));
> +
> +        /*
> +         * Look up the new driver unless the passed in value is NULL or
> +         * an empty string, in which case this is an unassign operation.
> +         */
> +        if (name && *name) {
> +                ops = pvsched_get_vcpu_ops(name);
> +                if (!ops) {
> +                        ret = -EINVAL;
> +                        goto out;
> +                }
> +        }
> +
> +        if (prev_ops) {
> +                /*
> +                 * Unregister each vCPU from the current pvsched driver.
> +                 */
> +                kvm_for_each_vcpu(i, vcpu, kvm) {
> +                        kvm_vcpu_pvsched_unregister(vcpu);
> +                }
> +
> +                pvsched_put_vcpu_ops(prev_ops);
> +        }
> +
> +        rcu_assign_pointer(kvm->pvsched_ops, ops);
> +        if (ops) {
> +                /*
> +                 * Register each vCPU with the new pvsched driver.
> +                 */
> +                kvm_for_each_vcpu(i, vcpu, kvm) {
> +                        WARN_ON_ONCE(kvm_vcpu_pvsched_register(vcpu));
> +                }
> +        }
> +
> +out:
> +        spin_unlock(&kvm->pvsched_ops_lock);
> +
> +        if (ret)
> +                return ret;
> +
> +        synchronize_rcu();
> +
> +        return 0;
> +}
> +#endif
> +
>  /*
>   * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
>   * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
> @@ -508,6 +644,8 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
>          kvm_arch_vcpu_destroy(vcpu);
>          kvm_dirty_ring_free(&vcpu->dirty_ring);
>
> +        kvm_vcpu_pvsched_unregister(vcpu);
> +
>          /*
>           * No need for rcu_read_lock as VCPU_RUN is the only place that changes
>           * the vcpu->pid pointer, and at destruction time all file descriptors
> @@ -1221,6 +1359,10 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>
>          BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +        spin_lock_init(&kvm->pvsched_ops_lock);
> +#endif
> +
>          /*
>           * Force subsequent debugfs file creations to fail if the VM directory
>           * is not created (by kvm_create_vm_debugfs()).
> @@ -1343,6 +1485,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
>          int i;
>          struct mm_struct *mm = kvm->mm;
>
> +        kvm_replace_pvsched_ops(kvm, NULL);
> +
>          kvm_destroy_pm_notifier(kvm);
>          kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
>          kvm_destroy_vm_debugfs(kvm);
> @@ -3779,6 +3923,8 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
>                  if (kvm_vcpu_check_block(vcpu) < 0)
>                          break;
>
> +                kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_HALT);
> +
>                  waited = true;
>                  schedule();
>          }
> @@ -4434,6 +4580,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                  /* The thread running this VCPU changed. */
>                  struct pid *newpid;
>
> +                kvm_vcpu_pvsched_unregister(vcpu);
>                  r = kvm_arch_vcpu_run_pid_change(vcpu);
>                  if (r)
>                          break;
> @@ -4442,6 +4589,7 @@
>                  rcu_assign_pointer(vcpu->pid, newpid);
>                  if (oldpid)
>                          synchronize_rcu();
> +                kvm_vcpu_pvsched_register(vcpu);
>                  put_pid(oldpid);
>          }
>          r = kvm_arch_vcpu_ioctl_run(vcpu);
> --
> 2.40.1
>
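
To make the per-VM lifecycle concrete, here is how a caller would drive
kvm_replace_pvsched_ops(), based purely on its semantics in this patch.
The plumbing that would actually expose this (ioctl, sysfs, ...) is not
part of this patch, so the wrapper below is hypothetical:

/* Hypothetical caller; only kvm_replace_pvsched_ops() is from the patch. */
static int pvsched_attach_detach_demo(struct kvm *kvm)
{
        int ret;

        /* Looks up the driver by name and registers every existing vCPU. */
        ret = kvm_replace_pvsched_ops(kvm, "my_pvsched");
        if (ret)
                return ret; /* -EINVAL if no driver with that name exists */

        /* ... the VM runs; the driver sees VMENTER/VMEXIT/HALT events ... */

        /* NULL (or "") unregisters the vCPUs and drops the driver. */
        return kvm_replace_pvsched_ops(kvm, NULL);
}

vCPUs whose task changes after a driver is attached get re-registered when
the new task enters KVM_RUN (the kvm_vcpu_ioctl() hunk above), and
kvm_destroy_vm() detaches the driver via kvm_replace_pvsched_ops(kvm, NULL).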