Adding sched_ext folks On Wed, Apr 3, 2024 at 10:01 AM Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx> wrote: > > Implement a paravirt scheduling framework for linux kernel. > > The framework allows for pvsched driver to register to the kernel and > receive callbacks from hypervisor(eg: kvm) for interested vcpu events > like VMENTER, VMEXIT etc. > > The framework also allows hypervisor to select a pvsched driver (from > the available list of registered drivers) for each guest. > > Also implement a sysctl for listing the available pvsched drivers. > > Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx> > Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx> > --- > Kconfig | 2 + > include/linux/pvsched.h | 102 +++++++++++++++++++ > kernel/sysctl.c | 27 +++++ > virt/Makefile | 2 +- > virt/pvsched/Kconfig | 12 +++ > virt/pvsched/Makefile | 2 + > virt/pvsched/pvsched.c | 215 ++++++++++++++++++++++++++++++++++++++++ > 7 files changed, 361 insertions(+), 1 deletion(-) > create mode 100644 include/linux/pvsched.h > create mode 100644 virt/pvsched/Kconfig > create mode 100644 virt/pvsched/Makefile > create mode 100644 virt/pvsched/pvsched.c > > diff --git a/Kconfig b/Kconfig > index 745bc773f567..4a52eaa21166 100644 > --- a/Kconfig > +++ b/Kconfig > @@ -29,4 +29,6 @@ source "lib/Kconfig" > > source "lib/Kconfig.debug" > > +source "virt/pvsched/Kconfig" > + > source "Documentation/Kconfig" > diff --git a/include/linux/pvsched.h b/include/linux/pvsched.h > new file mode 100644 > index 000000000000..59df6b44aacb > --- /dev/null > +++ b/include/linux/pvsched.h > @@ -0,0 +1,102 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +/* Copyright (c) 2024 Google */ > + > +#ifndef _LINUX_PVSCHED_H > +#define _LINUX_PVSCHED_H 1 > + > +/* > + * List of events for which hypervisor calls back into pvsched driver. > + * Driver can specify the events it is interested in. 
> + */ > +enum pvsched_vcpu_events { > + PVSCHED_VCPU_VMENTER = 0x1, > + PVSCHED_VCPU_VMEXIT = 0x2, > + PVSCHED_VCPU_HALT = 0x4, > + PVSCHED_VCPU_INTR_INJ = 0x8, > +}; > + > +#define PVSCHED_NAME_MAX 32 > +#define PVSCHED_MAX 8 > +#define PVSCHED_DRV_BUF_MAX (PVSCHED_NAME_MAX * PVSCHED_MAX + PVSCHED_MAX) > + > +/* > + * pvsched driver callbacks. > + * TODO: versioning support for better compatibility with the guest > + * component implementing this feature. > + */ > +struct pvsched_vcpu_ops { > + /* > + * pvsched_vcpu_register() - Register the vcpu with pvsched driver. > + * @pid: pid of the vcpu task. > + * > + * pvsched driver can store the pid internally and initialize > + * itself to prepare for receiving callbacks from this vcpu. > + */ > + int (*pvsched_vcpu_register)(struct pid *pid); > + > + /* > + * pvsched_vcpu_unregister() - Un-register the vcpu with pvsched driver. > + * @pid: pid of the vcpu task. > + */ > + void (*pvsched_vcpu_unregister)(struct pid *pid); > + > + /* > + * pvsched_vcpu_notify_event() - Callback for pvsched events > + * @addr: Address of the memory region shared with guest > + * @pid: pid of the vcpu task. > + * @event: bit mask of the events that hypervisor wants to notify. > + */ > + void (*pvsched_vcpu_notify_event)(void *addr, struct pid *pid, u32 event); > + > + char name[PVSCHED_NAME_MAX]; > + struct module *owner; > + struct list_head list; > + u32 events; > + u32 key; > +}; > + > +#ifdef CONFIG_PARAVIRT_SCHED_HOST > +int pvsched_get_available_drivers(char *buf, size_t maxlen); > + > +int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops); > +void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops); > + > +struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name); > +void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops); > + > +static inline int pvsched_validate_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > + /* > + * All callbacks are mandatory. 
> + */ > + if (!ops->pvsched_vcpu_register || !ops->pvsched_vcpu_unregister || > + !ops->pvsched_vcpu_notify_event) > + return -EINVAL; > + > + return 0; > +} > +#else > +static inline void pvsched_get_available_drivers(char *buf, size_t maxlen) > +{ > +} > + > +static inline int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > + return -ENOTSUPP; > +} > + > +static inline void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > +} > + > +static inline struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name) > +{ > + return NULL; > +} > + > +static inline void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > +} > +#endif > + > +#endif > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index 157f7ce2942d..10a18a791b4f 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -63,6 +63,7 @@ > #include <linux/mount.h> > #include <linux/userfaultfd_k.h> > #include <linux/pid.h> > +#include <linux/pvsched.h> > > #include "../lib/kstrtox.h" > > @@ -1615,6 +1616,24 @@ int proc_do_static_key(struct ctl_table *table, int write, > return ret; > } > > +#ifdef CONFIG_PARAVIRT_SCHED_HOST > +static int proc_pvsched_available_drivers(struct ctl_table *ctl, > + int write, void *buffer, > + size_t *lenp, loff_t *ppos) > +{ > + struct ctl_table tbl = { .maxlen = PVSCHED_DRV_BUF_MAX, }; > + int ret; > + > + tbl.data = kmalloc(tbl.maxlen, GFP_USER); > + if (!tbl.data) > + return -ENOMEM; > + pvsched_get_available_drivers(tbl.data, PVSCHED_DRV_BUF_MAX); > + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); > + kfree(tbl.data); > + return ret; > +} > +#endif > + > static struct ctl_table kern_table[] = { > { > .procname = "panic", > @@ -2033,6 +2052,14 @@ static struct ctl_table kern_table[] = { > .extra1 = SYSCTL_ONE, > .extra2 = SYSCTL_INT_MAX, > }, > +#endif > +#ifdef CONFIG_PARAVIRT_SCHED_HOST > + { > + .procname = "pvsched_available_drivers", > + .maxlen = PVSCHED_DRV_BUF_MAX, > + .mode = 0444, > + .proc_handler = 
proc_pvsched_available_drivers, > + }, > #endif > { } > }; > diff --git a/virt/Makefile b/virt/Makefile > index 1cfea9436af9..9d0f32d775a1 100644 > --- a/virt/Makefile > +++ b/virt/Makefile > @@ -1,2 +1,2 @@ > # SPDX-License-Identifier: GPL-2.0-only > -obj-y += lib/ > +obj-y += lib/ pvsched/ > diff --git a/virt/pvsched/Kconfig b/virt/pvsched/Kconfig > new file mode 100644 > index 000000000000..5ca2669060cb > --- /dev/null > +++ b/virt/pvsched/Kconfig > @@ -0,0 +1,12 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +config PARAVIRT_SCHED_HOST > + bool "Paravirt scheduling framework in the host kernel" > + default n > + help > + Paravirtualized scheduling facilitates the exchange of scheduling > + related information between the host and guest through shared memory, > + enhancing the efficiency of vCPU thread scheduling by the hypervisor. > + An illustrative use case involves dynamically boosting the priority of > + a vCPU thread when the guest is executing a latency-sensitive workload > + on that specific vCPU. > + This config enables paravirt scheduling framework in the host kernel. > diff --git a/virt/pvsched/Makefile b/virt/pvsched/Makefile > new file mode 100644 > index 000000000000..4ca38e30479b > --- /dev/null > +++ b/virt/pvsched/Makefile > @@ -0,0 +1,2 @@ > + > +obj-$(CONFIG_PARAVIRT_SCHED_HOST) += pvsched.o > diff --git a/virt/pvsched/pvsched.c b/virt/pvsched/pvsched.c > new file mode 100644 > index 000000000000..610c85cf90d2 > --- /dev/null > +++ b/virt/pvsched/pvsched.c > @@ -0,0 +1,215 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* Copyright (c) 2024 Google */ > + > +/* > + * Paravirt scheduling framework > + * > + */ > + > +/* > + * Heavily inspired from tcp congestion avoidance implementation. 
> + * (net/ipv4/tcp_cong.c) > + */ > + > +#define pr_fmt(fmt) "PVSCHED: " fmt > + > +#include <linux/module.h> > +#include <linux/bpf.h> > +#include <linux/gfp.h> > +#include <linux/types.h> > +#include <linux/list.h> > +#include <linux/jhash.h> > +#include <linux/pvsched.h> > + > +static DEFINE_SPINLOCK(pvsched_drv_list_lock); > +static int nr_pvsched_drivers = 0; > +static LIST_HEAD(pvsched_drv_list); > + > +/* > + * Retrieve pvsched_vcpu_ops given the name. > + */ > +static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_name(char *name) > +{ > + struct pvsched_vcpu_ops *ops; > + > + list_for_each_entry_rcu(ops, &pvsched_drv_list, list) { > + if (strcmp(ops->name, name) == 0) > + return ops; > + } > + > + return NULL; > +} > + > +/* > + * Retrieve pvsched_vcpu_ops given the hash key. > + */ > +static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_key(u32 key) > +{ > + struct pvsched_vcpu_ops *ops; > + > + list_for_each_entry_rcu(ops, &pvsched_drv_list, list) { > + if (ops->key == key) > + return ops; > + } > + > + return NULL; > +} > + > +/* > + * pvsched_get_available_drivers() - Copy space separated list of pvsched > + * driver names. > + * @buf: buffer to store the list of driver names > + * @maxlen: size of the buffer > + * > + * Return: 0 on success, negative value on error. > + */ > +int pvsched_get_available_drivers(char *buf, size_t maxlen) > +{ > + struct pvsched_vcpu_ops *ops; > + size_t offs = 0; > + > + if (!buf) > + return -EINVAL; > + > + if (maxlen > PVSCHED_DRV_BUF_MAX) > + maxlen = PVSCHED_DRV_BUF_MAX; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(ops, &pvsched_drv_list, list) { > + offs += snprintf(buf + offs, maxlen - offs, > + "%s%s", > + offs == 0 ? "" : " ", ops->name); > + > + if (WARN_ON_ONCE(offs >= maxlen)) > + break; > + } > + rcu_read_unlock(); > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(pvsched_get_available_drivers); > + > +/* > + * pvsched_register_vcpu_ops() - Register the driver in the kernel. 
> + * @ops: Driver data(callbacks) > + * > + * After the registration, driver will be exposed to the hypervisor > + * for assignment to the guest VMs. > + * > + * Return: 0 on success, negative value on error. > + */ > +int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > + int ret = 0; > + > + ops->key = jhash(ops->name, sizeof(ops->name), strlen(ops->name)); > + spin_lock(&pvsched_drv_list_lock); > + if (nr_pvsched_drivers > PVSCHED_MAX) { > + ret = -ENOSPC; > + } if (pvsched_find_vcpu_ops_key(ops->key)) { > + ret = -EEXIST; > + } else if (!(ret = pvsched_validate_vcpu_ops(ops))) { > + list_add_tail_rcu(&ops->list, &pvsched_drv_list); > + nr_pvsched_drivers++; > + } > + spin_unlock(&pvsched_drv_list_lock); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(pvsched_register_vcpu_ops); > + > +/* > + * pvsched_unregister_vcpu_ops() - Un-register the driver from the kernel. > + * @ops: Driver data(callbacks) > + * > + * After un-registration, driver will not be visible to hypervisor. > + */ > +void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > + spin_lock(&pvsched_drv_list_lock); > + list_del_rcu(&ops->list); > + nr_pvsched_drivers--; > + spin_unlock(&pvsched_drv_list_lock); > + > + synchronize_rcu(); > +} > +EXPORT_SYMBOL_GPL(pvsched_unregister_vcpu_ops); > + > +/* > + * pvsched_get_vcpu_ops: Acquire the driver. > + * @name: Name of the driver to be acquired. > + * > + * Hypervisor can use this API to get the driver structure for > + * assigning it to guest VMs. This API takes a reference on the > + * module/bpf program so that driver doesn't vanish under the > + * hypervisor. > + * > + * Return: driver structure if found, else NULL. 
> + */ > +struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name) > +{ > + struct pvsched_vcpu_ops *ops; > + > + if (!name || (strlen(name) >= PVSCHED_NAME_MAX)) > + return NULL; > + > + rcu_read_lock(); > + ops = pvsched_find_vcpu_ops_name(name); > + if (!ops) > + goto out; > + > + if (unlikely(!bpf_try_module_get(ops, ops->owner))) { > + ops = NULL; > + goto out; > + } > + > +out: > + rcu_read_unlock(); > + return ops; > +} > +EXPORT_SYMBOL_GPL(pvsched_get_vcpu_ops); > + > +/* > + * pvsched_put_vcpu_ops: Release the driver. > + * @ops: Driver to be released. > + * > + * Hypervisor can use this API to release the driver. > + */ > +void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops) > +{ > + bpf_module_put(ops, ops->owner); > +} > +EXPORT_SYMBOL_GPL(pvsched_put_vcpu_ops); > + > +/* > + * NOP vm_ops Sample implementation. > + * This driver doesn't do anything other than registering itself. > + * Placeholder for adding some default logic when the feature is > + * complete. > + */ > +static int nop_pvsched_vcpu_register(struct pid *pid) > +{ > + return 0; > +} > +static void nop_pvsched_vcpu_unregister(struct pid *pid) > +{ > +} > +static void nop_pvsched_notify_event(void *addr, struct pid *pid, u32 event) > +{ > +} > + > +struct pvsched_vcpu_ops nop_vcpu_ops = { > + .events = PVSCHED_VCPU_VMENTER | PVSCHED_VCPU_VMEXIT | PVSCHED_VCPU_HALT, > + .pvsched_vcpu_register = nop_pvsched_vcpu_register, > + .pvsched_vcpu_unregister = nop_pvsched_vcpu_unregister, > + .pvsched_vcpu_notify_event = nop_pvsched_notify_event, > + .name = "pvsched_nop", > + .owner = THIS_MODULE, > +}; > + > +static int __init pvsched_init(void) > +{ > + return WARN_ON(pvsched_register_vcpu_ops(&nop_vcpu_ops)); > +} > + > +late_initcall(pvsched_init); > -- > 2.40.1 >