Adding sched_ext folks

On Wed, Apr 3, 2024 at 10:01 AM Vineeth Pillai (Google)
<vineeth@xxxxxxxxxxxxxxx> wrote:
>
> KVM uses the kernel's paravirt scheduling (pvsched) framework to assign
> an available pvsched driver to a guest. Guest vCPUs register with the
> pvsched driver and call into the driver callbacks to notify it of the
> events it is interested in.
>
> This PoC doesn't do the callback on interrupt injection yet. It will be
> implemented in subsequent iterations.
>
> Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
> Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
> ---
>  arch/x86/kvm/Kconfig     |  13 ++++
>  arch/x86/kvm/x86.c       |   3 +
>  include/linux/kvm_host.h |  32 +++++++++
>  virt/kvm/kvm_main.c      | 148 +++++++++++++++++++++++++++++++++++++++
>  4 files changed, 196 insertions(+)
>
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index 65ed14b6540b..c1776cdb5b65 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -189,4 +189,17 @@ config KVM_MAX_NR_VCPUS
>           the memory footprint of each KVM guest, regardless of how many vCPUs are
>           created for a given VM.
>
> +config PARAVIRT_SCHED_KVM
> +        bool "Enable paravirt scheduling capability for kvm"
> +        depends on KVM
> +        default n
> +        help
> +          Paravirtualized scheduling facilitates the exchange of
> +          scheduling-related information between the host and guest through
> +          shared memory, enhancing the efficiency of vCPU thread scheduling
> +          by the hypervisor. An illustrative use case involves dynamically
> +          boosting the priority of a vCPU thread when the guest is executing
> +          a latency-sensitive workload on that specific vCPU.
> +          This config enables paravirt scheduling in the kvm hypervisor.
> +
>  endif # VIRTUALIZATION
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index ffe580169c93..d0abc2c64d47 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10896,6 +10896,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>
>          preempt_disable();
>
> +        kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMENTER);
> +
>          static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
>
>          /*
> @@ -11059,6 +11061,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>          guest_timing_exit_irqoff();
>
>          local_irq_enable();
> +        kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_VMEXIT);
>          preempt_enable();
>
>          kvm_vcpu_srcu_read_lock(vcpu);
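
One thing worth highlighting for the sched_ext folks: both notifications
above fire inside vcpu_enter_guest() with preemption disabled (the VMEXIT
one after local_irq_enable() but before preempt_enable()), so the driver
hook has to be non-sleeping. A minimal sketch of the driver-side callback
is below; the signature matches how ops->pvsched_vcpu_notify_event() is
invoked later in this patch, but the function name and the pr_debug()
bodies are placeholders, and a real driver would likely defer any
sched_setscheduler()-style boosting out of this atomic path:

/*
 * Hypothetical driver callback matching the call
 * ops->pvsched_vcpu_notify_event(addr, pid, events) made by this patch.
 * 'addr' is NULL for now; the guest shared memory page comes later.
 */
static void my_pvsched_vcpu_notify_event(void *addr, struct pid *pid,
                                         u32 events)
{
        /* Runs with preemption disabled: must not sleep. */
        if (events & PVSCHED_VCPU_VMENTER)
                pr_debug("pvsched: pid %d entering guest\n", pid_nr(pid));
        if (events & PVSCHED_VCPU_VMEXIT)
                pr_debug("pvsched: pid %d exited to host\n", pid_nr(pid));
}
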
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 179df96b20f8..6381569f3de8 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -45,6 +45,8 @@
>  #include <asm/kvm_host.h>
>  #include <linux/kvm_dirty_ring.h>
>
> +#include <linux/pvsched.h>
> +
>  #ifndef KVM_MAX_VCPU_IDS
>  #define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
>  #endif
> @@ -832,6 +834,11 @@ struct kvm {
>          bool vm_bugged;
>          bool vm_dead;
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +        spinlock_t pvsched_ops_lock;
> +        struct pvsched_vcpu_ops __rcu *pvsched_ops;
> +#endif
> +
>  #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
>          struct notifier_block pm_notifier;
>  #endif
> @@ -2413,4 +2420,29 @@ static inline int kvm_gmem_get_pfn(struct kvm *kvm,
>  }
>  #endif /* CONFIG_KVM_PRIVATE_MEM */
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events);
> +int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu);
> +void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu);
> +
> +int kvm_replace_pvsched_ops(struct kvm *kvm, char *name);
> +#else
> +static inline int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
> +{
> +        return 0;
> +}
> +static inline int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
> +{
> +        return 0;
> +}
> +static inline void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
> +{
> +}
> +
> +static inline int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
> +{
> +        return 0;
> +}
> +#endif
> +
>  #endif
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 0f50960b0e3a..0546814e4db7 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -170,6 +170,142 @@ bool kvm_is_zone_device_page(struct page *page)
>          return is_zone_device_page(page);
>  }
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +typedef enum {
> +        PVSCHED_CB_REGISTER = 1,
> +        PVSCHED_CB_UNREGISTER = 2,
> +        PVSCHED_CB_NOTIFY = 3
> +} pvsched_vcpu_callback_t;
> +
> +/*
> + * Helper function to invoke the pvsched driver callbacks.
> + */
> +static int __vcpu_pvsched_callback(struct kvm_vcpu *vcpu, u32 events,
> +                                   pvsched_vcpu_callback_t action)
> +{
> +        int ret = 0;
> +        struct pid *pid;
> +        struct pvsched_vcpu_ops *ops;
> +
> +        rcu_read_lock();
> +        ops = rcu_dereference(vcpu->kvm->pvsched_ops);
> +        if (!ops) {
> +                ret = -ENOENT;
> +                goto out;
> +        }
> +
> +        pid = rcu_dereference(vcpu->pid);
> +        if (WARN_ON_ONCE(!pid)) {
> +                ret = -EINVAL;
> +                goto out;
> +        }
> +        get_pid(pid);
> +        switch (action) {
> +        case PVSCHED_CB_REGISTER:
> +                ops->pvsched_vcpu_register(pid);
> +                break;
> +        case PVSCHED_CB_UNREGISTER:
> +                ops->pvsched_vcpu_unregister(pid);
> +                break;
> +        case PVSCHED_CB_NOTIFY:
> +                if (ops->events & events) {
> +                        ops->pvsched_vcpu_notify_event(
> +                                NULL, /* TODO: Pass guest allocated sharedmem addr */
> +                                pid,
> +                                ops->events & events);
> +                }
> +                break;
> +        default:
> +                WARN_ON_ONCE(1);
> +        }
> +        put_pid(pid);
> +
> +out:
> +        rcu_read_unlock();
> +        return ret;
> +}
> +
> +int kvm_vcpu_pvsched_notify(struct kvm_vcpu *vcpu, u32 events)
> +{
> +        return __vcpu_pvsched_callback(vcpu, events, PVSCHED_CB_NOTIFY);
> +}
> +
> +int kvm_vcpu_pvsched_register(struct kvm_vcpu *vcpu)
> +{
> +        return __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_REGISTER);
> +        /*
> +         * TODO: Action if the registration fails?
> +         */
> +}
> +
> +void kvm_vcpu_pvsched_unregister(struct kvm_vcpu *vcpu)
> +{
> +        __vcpu_pvsched_callback(vcpu, 0, PVSCHED_CB_UNREGISTER);
> +}
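
Stepping back, the contract a pvsched driver must satisfy is visible from
the helper above: an events mask that gates notifications, plus
register/unregister/notify callbacks keyed by the vCPU task's struct pid.
A rough sketch of what a driver fills in is below. The callback signatures
and the PVSCHED_VCPU_* flags are taken from this patch; the .name field
and how the ops get registered with the framework (that part of the series
is not shown here) are assumptions:

/* Hypothetical driver, under the assumptions noted above. */
static void my_vcpu_register(struct pid *pid)
{
        /* Set up per-vCPU tracking state keyed by the task pid. */
}

static void my_vcpu_unregister(struct pid *pid)
{
        /* Tear down the per-vCPU state. */
}

static void my_vcpu_notify_event(void *addr, struct pid *pid, u32 events)
{
        /* React to the events this driver subscribed to below. */
}

static struct pvsched_vcpu_ops my_pvsched_ops = {
        .name                      = "my_pvsched", /* assumed lookup key */
        .events                    = PVSCHED_VCPU_VMENTER |
                                     PVSCHED_VCPU_VMEXIT |
                                     PVSCHED_VCPU_HALT,
        .pvsched_vcpu_register     = my_vcpu_register,
        .pvsched_vcpu_unregister   = my_vcpu_unregister,
        .pvsched_vcpu_notify_event = my_vcpu_notify_event,
};

Presumably pvsched_get_vcpu_ops(name) resolves that name and takes a
reference (likely a module refcount) that pvsched_put_vcpu_ops() drops.
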
> +
> +/*
> + * Replace the VM's current pvsched driver.
> + * If name is NULL or an empty string, unassign the
> + * current driver.
> + */
> +int kvm_replace_pvsched_ops(struct kvm *kvm, char *name)
> +{
> +        int ret = 0;
> +        unsigned long i;
> +        struct kvm_vcpu *vcpu = NULL;
> +        struct pvsched_vcpu_ops *ops = NULL, *prev_ops;
> +
> +        spin_lock(&kvm->pvsched_ops_lock);
> +
> +        prev_ops = rcu_dereference_protected(kvm->pvsched_ops,
> +                                lockdep_is_held(&kvm->pvsched_ops_lock));
> +
> +        /*
> +         * Look up the new driver unless the passed in value is NULL or
> +         * an empty string, in which case this is an unassign operation.
> +         */
> +        if (name && *name) {
> +                ops = pvsched_get_vcpu_ops(name);
> +                if (!ops) {
> +                        ret = -EINVAL;
> +                        goto out;
> +                }
> +        }
> +
> +        if (prev_ops) {
> +                /*
> +                 * Unregister each vCPU from the current pvsched driver.
> +                 */
> +                kvm_for_each_vcpu(i, vcpu, kvm) {
> +                        kvm_vcpu_pvsched_unregister(vcpu);
> +                }
> +
> +                pvsched_put_vcpu_ops(prev_ops);
> +        }
> +
> +        rcu_assign_pointer(kvm->pvsched_ops, ops);
> +        if (ops) {
> +                /*
> +                 * Register each vCPU with the new pvsched driver.
> +                 */
> +                kvm_for_each_vcpu(i, vcpu, kvm) {
> +                        WARN_ON_ONCE(kvm_vcpu_pvsched_register(vcpu));
> +                }
> +        }
> +
> +out:
> +        spin_unlock(&kvm->pvsched_ops_lock);
> +
> +        if (ret)
> +                return ret;
> +
> +        synchronize_rcu();
> +
> +        return 0;
> +}
> +#endif
> +
>  /*
>   * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
>   * page, NULL otherwise. Note, the list of refcounted PG_reserved page types
> @@ -508,6 +644,8 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
>          kvm_arch_vcpu_destroy(vcpu);
>          kvm_dirty_ring_free(&vcpu->dirty_ring);
>
> +        kvm_vcpu_pvsched_unregister(vcpu);
> +
>          /*
>           * No need for rcu_read_lock as VCPU_RUN is the only place that changes
>           * the vcpu->pid pointer, and at destruction time all file descriptors
> @@ -1221,6 +1359,10 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
>
>          BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
>
> +#ifdef CONFIG_PARAVIRT_SCHED_KVM
> +        spin_lock_init(&kvm->pvsched_ops_lock);
> +#endif
> +
>          /*
>           * Force subsequent debugfs file creations to fail if the VM directory
>           * is not created (by kvm_create_vm_debugfs()).
> @@ -1343,6 +1485,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
>          int i;
>          struct mm_struct *mm = kvm->mm;
>
> +        kvm_replace_pvsched_ops(kvm, NULL);
> +
>          kvm_destroy_pm_notifier(kvm);
>          kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
>          kvm_destroy_vm_debugfs(kvm);
> @@ -3779,6 +3923,8 @@ bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
>                  if (kvm_vcpu_check_block(vcpu) < 0)
>                          break;
>
> +                kvm_vcpu_pvsched_notify(vcpu, PVSCHED_VCPU_HALT);
> +
>                  waited = true;
>                  schedule();
>          }
> @@ -4434,6 +4580,7 @@ static long kvm_vcpu_ioctl(struct file *filp,
>                  /* The thread running this VCPU changed. */
>                  struct pid *newpid;
>
> +                kvm_vcpu_pvsched_unregister(vcpu);
>                  r = kvm_arch_vcpu_run_pid_change(vcpu);
>                  if (r)
>                          break;
> @@ -4442,6 +4589,7 @@
>                  rcu_assign_pointer(vcpu->pid, newpid);
>                  if (oldpid)
>                          synchronize_rcu();
> +                kvm_vcpu_pvsched_register(vcpu);
>                  put_pid(oldpid);
>          }
>          r = kvm_arch_vcpu_ioctl_run(vcpu);
> --
> 2.40.1
>
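
To make the per-VM lifecycle concrete, here is how a caller would drive
kvm_replace_pvsched_ops(), based purely on its semantics in this patch.
The plumbing that would actually expose this (ioctl, sysfs, ...) is not
part of this patch, so the wrapper below is hypothetical:

/* Hypothetical caller; only kvm_replace_pvsched_ops() is from the patch. */
static int pvsched_attach_detach_demo(struct kvm *kvm)
{
        int ret;

        /* Looks up the driver by name and registers every existing vCPU. */
        ret = kvm_replace_pvsched_ops(kvm, "my_pvsched");
        if (ret)
                return ret; /* -EINVAL if no driver with that name exists */

        /* ... the VM runs; the driver sees VMENTER/VMEXIT/HALT events ... */

        /* NULL (or "") unregisters the vCPUs and drops the driver. */
        return kvm_replace_pvsched_ops(kvm, NULL);
}

vCPUs whose task changes after a driver is attached get re-registered when
the new task enters KVM_RUN (the kvm_vcpu_ioctl() hunk above), and
kvm_destroy_vm() detaches the driver via kvm_replace_pvsched_ops(kvm, NULL).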