Implement a paravirt scheduling framework for the Linux kernel.

The framework allows a pvsched driver to register with the kernel and
receive callbacks from the hypervisor (e.g. KVM) for the vcpu events it is
interested in, such as VMENTER, VMEXIT etc. The framework also allows the
hypervisor to select a pvsched driver (from the available list of
registered drivers) for each guest.

Also implement a sysctl for listing the available pvsched drivers.

Signed-off-by: Vineeth Pillai (Google) <vineeth@xxxxxxxxxxxxxxx>
Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
 Kconfig                 |   2 +
 include/linux/pvsched.h | 102 +++++++++++++++++++
 kernel/sysctl.c         |  27 +++++
 virt/Makefile           |   2 +-
 virt/pvsched/Kconfig    |  12 +++
 virt/pvsched/Makefile   |   2 +
 virt/pvsched/pvsched.c  | 215 ++++++++++++++++++++++++++++++++++++++++
 7 files changed, 361 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/pvsched.h
 create mode 100644 virt/pvsched/Kconfig
 create mode 100644 virt/pvsched/Makefile
 create mode 100644 virt/pvsched/pvsched.c

diff --git a/Kconfig b/Kconfig
index 745bc773f567..4a52eaa21166 100644
--- a/Kconfig
+++ b/Kconfig
@@ -29,4 +29,6 @@ source "lib/Kconfig"
 
 source "lib/Kconfig.debug"
 
+source "virt/pvsched/Kconfig"
+
 source "Documentation/Kconfig"
diff --git a/include/linux/pvsched.h b/include/linux/pvsched.h
new file mode 100644
index 000000000000..59df6b44aacb
--- /dev/null
+++ b/include/linux/pvsched.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2024 Google */
+
+#ifndef _LINUX_PVSCHED_H
+#define _LINUX_PVSCHED_H 1
+
+/*
+ * List of events for which hypervisor calls back into pvsched driver.
+ * Driver can specify the events it is interested in.
+ */
+enum pvsched_vcpu_events {
+	PVSCHED_VCPU_VMENTER = 0x1,
+	PVSCHED_VCPU_VMEXIT = 0x2,
+	PVSCHED_VCPU_HALT = 0x4,
+	PVSCHED_VCPU_INTR_INJ = 0x8,
+};
+
+#define PVSCHED_NAME_MAX	32
+#define PVSCHED_MAX		8
+#define PVSCHED_DRV_BUF_MAX	(PVSCHED_NAME_MAX * PVSCHED_MAX + PVSCHED_MAX)
+
+/*
+ * pvsched driver callbacks.
+ * TODO: versioning support for better compatibility with the guest
+ *       component implementing this feature.
+ */
+struct pvsched_vcpu_ops {
+	/*
+	 * pvsched_vcpu_register() - Register the vcpu with pvsched driver.
+	 * @pid: pid of the vcpu task.
+	 *
+	 * pvsched driver can store the pid internally and initialize
+	 * itself to prepare for receiving callbacks from this vcpu.
+	 */
+	int (*pvsched_vcpu_register)(struct pid *pid);
+
+	/*
+	 * pvsched_vcpu_unregister() - Un-register the vcpu with pvsched driver.
+	 * @pid: pid of the vcpu task.
+	 */
+	void (*pvsched_vcpu_unregister)(struct pid *pid);
+
+	/*
+	 * pvsched_vcpu_notify_event() - Callback for pvsched events
+	 * @addr: Address of the memory region shared with guest
+	 * @pid: pid of the vcpu task.
+	 * @event: bit mask of the events that hypervisor wants to notify.
+	 */
+	void (*pvsched_vcpu_notify_event)(void *addr, struct pid *pid, u32 event);
+
+	char name[PVSCHED_NAME_MAX];
+	struct module *owner;
+	struct list_head list;
+	u32 events;
+	u32 key;
+};
+
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+int pvsched_get_available_drivers(char *buf, size_t maxlen);
+
+int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops);
+void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops);
+
+struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name);
+void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops);
+
+static inline int pvsched_validate_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	/*
+	 * All callbacks are mandatory.
+	 */
+	if (!ops->pvsched_vcpu_register || !ops->pvsched_vcpu_unregister ||
+			!ops->pvsched_vcpu_notify_event)
+		return -EINVAL;
+	return 0;
+}
+#else
+static inline int pvsched_get_available_drivers(char *buf, size_t maxlen)
+{
+	return -ENOTSUPP;
+}
+
+static inline int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	return -ENOTSUPP;
+}
+
+static inline void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+}
+
+static inline struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
+{
+	return NULL;
+}
+
+static inline void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+}
+#endif
+
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 157f7ce2942d..10a18a791b4f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
 #include <linux/mount.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/pid.h>
+#include <linux/pvsched.h>
 
 #include "../lib/kstrtox.h"
 
@@ -1615,6 +1616,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
 	return ret;
 }
 
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+static int proc_pvsched_available_drivers(struct ctl_table *ctl,
+					  int write, void *buffer,
+					  size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tbl = { .maxlen = PVSCHED_DRV_BUF_MAX, };
+	int ret;
+
+	tbl.data = kzalloc(tbl.maxlen, GFP_USER);
+	if (!tbl.data)
+		return -ENOMEM;
+	pvsched_get_available_drivers(tbl.data, PVSCHED_DRV_BUF_MAX);
+	ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+	kfree(tbl.data);
+	return ret;
+}
+#endif
+
 static struct ctl_table kern_table[] = {
 	{
 		.procname	= "panic",
@@ -2033,6 +2052,14 @@ static struct ctl_table kern_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= SYSCTL_INT_MAX,
 	},
+#endif
+#ifdef CONFIG_PARAVIRT_SCHED_HOST
+	{
+		.procname	= "pvsched_available_drivers",
+		.maxlen		= PVSCHED_DRV_BUF_MAX,
+		.mode		= 0444,
+		.proc_handler	= proc_pvsched_available_drivers,
+	},
 #endif
 	{ }
 };
diff --git a/virt/Makefile b/virt/Makefile
index 1cfea9436af9..9d0f32d775a1 100644
--- a/virt/Makefile
+++ b/virt/Makefile
@@ -1,2 +1,2 @@
 # SPDX-License-Identifier: GPL-2.0-only
-obj-y	+= lib/
+obj-y	+= lib/ pvsched/
diff --git a/virt/pvsched/Kconfig b/virt/pvsched/Kconfig
new file mode 100644
index 000000000000..5ca2669060cb
--- /dev/null
+++ b/virt/pvsched/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config PARAVIRT_SCHED_HOST
+	bool "Paravirt scheduling framework in the host kernel"
+	default n
+	help
+	  Paravirtualized scheduling facilitates the exchange of scheduling
+	  related information between the host and guest through shared memory,
+	  enhancing the efficiency of vCPU thread scheduling by the hypervisor.
+	  An illustrative use case involves dynamically boosting the priority of
+	  a vCPU thread when the guest is executing a latency-sensitive workload
+	  on that specific vCPU.
+	  This config enables paravirt scheduling framework in the host kernel.
diff --git a/virt/pvsched/Makefile b/virt/pvsched/Makefile
new file mode 100644
index 000000000000..4ca38e30479b
--- /dev/null
+++ b/virt/pvsched/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_PARAVIRT_SCHED_HOST) += pvsched.o
diff --git a/virt/pvsched/pvsched.c b/virt/pvsched/pvsched.c
new file mode 100644
index 000000000000..610c85cf90d2
--- /dev/null
+++ b/virt/pvsched/pvsched.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 Google */
+
+/*
+ *  Paravirt scheduling framework
+ *
+ */
+
+/*
+ * Heavily inspired from tcp congestion avoidance implementation.
+ * (net/ipv4/tcp_cong.c)
+ */
+
+#define pr_fmt(fmt) "PVSCHED: " fmt
+
+#include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/gfp.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/pvsched.h>
+
+static DEFINE_SPINLOCK(pvsched_drv_list_lock);
+static int nr_pvsched_drivers = 0;
+static LIST_HEAD(pvsched_drv_list);
+
+/*
+ * Retrieve pvsched_vcpu_ops given the name.
+ */
+static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_name(const char *name)
+{
+	struct pvsched_vcpu_ops *ops;
+
+	list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+		if (strcmp(ops->name, name) == 0)
+			return ops;
+	}
+
+	return NULL;
+}
+
+/*
+ * Retrieve pvsched_vcpu_ops given the hash key.
+ */
+static struct pvsched_vcpu_ops *pvsched_find_vcpu_ops_key(u32 key)
+{
+	struct pvsched_vcpu_ops *ops;
+
+	list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+		if (ops->key == key)
+			return ops;
+	}
+
+	return NULL;
+}
+
+/*
+ * pvsched_get_available_drivers() - Copy space separated list of pvsched
+ * driver names.
+ * @buf: buffer to store the list of driver names
+ * @maxlen: size of the buffer
+ *
+ * Return: 0 on success, negative value on error.
+ */
+int pvsched_get_available_drivers(char *buf, size_t maxlen)
+{
+	struct pvsched_vcpu_ops *ops;
+	size_t offs = 0;
+
+	if (!buf || !maxlen)
+		return -EINVAL;
+
+	maxlen = min_t(size_t, maxlen, PVSCHED_DRV_BUF_MAX);
+	buf[0] = '\0';	/* no drivers registered => empty string */
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ops, &pvsched_drv_list, list) {
+		offs += snprintf(buf + offs, maxlen - offs,
+				 "%s%s",
+				 offs == 0 ? "" : " ", ops->name);
+
+		if (WARN_ON_ONCE(offs >= maxlen))
+			break;
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(pvsched_get_available_drivers);
+
+/*
+ * pvsched_register_vcpu_ops() - Register the driver in the kernel.
+ * @ops: Driver data(callbacks)
+ *
+ * After the registration, driver will be exposed to the hypervisor
+ * for assignment to the guest VMs.
+ *
+ * Return: 0 on success, negative value on error.
+ */
+int pvsched_register_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	int ret = 0;
+
+	ops->key = jhash(ops->name, sizeof(ops->name), strlen(ops->name));
+	spin_lock(&pvsched_drv_list_lock);
+	if (nr_pvsched_drivers >= PVSCHED_MAX) {
+		ret = -ENOSPC;
+	} else if (pvsched_find_vcpu_ops_key(ops->key)) {
+		ret = -EEXIST;
+	} else if (!(ret = pvsched_validate_vcpu_ops(ops))) {
+		list_add_tail_rcu(&ops->list, &pvsched_drv_list);
+		nr_pvsched_drivers++;
+	}
+	spin_unlock(&pvsched_drv_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(pvsched_register_vcpu_ops);
+
+/*
+ * pvsched_unregister_vcpu_ops() - Un-register the driver from the kernel.
+ * @ops: Driver data(callbacks)
+ *
+ * After un-registration, driver will not be visible to hypervisor.
+ */
+void pvsched_unregister_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	spin_lock(&pvsched_drv_list_lock);
+	list_del_rcu(&ops->list);
+	nr_pvsched_drivers--;
+	spin_unlock(&pvsched_drv_list_lock);
+
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(pvsched_unregister_vcpu_ops);
+
+/*
+ * pvsched_get_vcpu_ops: Acquire the driver.
+ * @name: Name of the driver to be acquired.
+ *
+ * Hypervisor can use this API to get the driver structure for
+ * assigning it to guest VMs. This API takes a reference on the
+ * module/bpf program so that driver doesn't vanish under the
+ * hypervisor.
+ *
+ * Return: driver structure if found, else NULL.
+ */
+struct pvsched_vcpu_ops *pvsched_get_vcpu_ops(char *name)
+{
+	struct pvsched_vcpu_ops *ops;
+
+	if (!name || (strlen(name) >= PVSCHED_NAME_MAX))
+		return NULL;
+
+	rcu_read_lock();
+	ops = pvsched_find_vcpu_ops_name(name);
+	if (!ops)
+		goto out;
+
+	if (unlikely(!bpf_try_module_get(ops, ops->owner))) {
+		ops = NULL;
+		goto out;
+	}
+
+out:
+	rcu_read_unlock();
+	return ops;
+}
+EXPORT_SYMBOL_GPL(pvsched_get_vcpu_ops);
+
+/*
+ * pvsched_put_vcpu_ops: Release the driver.
+ * @ops: Driver to be released.
+ *
+ * Hypervisor can use this API to release the driver.
+ */
+void pvsched_put_vcpu_ops(struct pvsched_vcpu_ops *ops)
+{
+	bpf_module_put(ops, ops->owner);
+}
+EXPORT_SYMBOL_GPL(pvsched_put_vcpu_ops);
+
+/*
+ * NOP vcpu_ops sample implementation.
+ * This driver doesn't do anything other than registering itself.
+ * Placeholder for adding some default logic when the feature is
+ * complete.
+ */
+static int nop_pvsched_vcpu_register(struct pid *pid)
+{
+	return 0;
+}
+static void nop_pvsched_vcpu_unregister(struct pid *pid)
+{
+}
+static void nop_pvsched_notify_event(void *addr, struct pid *pid, u32 event)
+{
+}
+
+static struct pvsched_vcpu_ops nop_vcpu_ops = {
+	.events = PVSCHED_VCPU_VMENTER | PVSCHED_VCPU_VMEXIT | PVSCHED_VCPU_HALT,
+	.pvsched_vcpu_register = nop_pvsched_vcpu_register,
+	.pvsched_vcpu_unregister = nop_pvsched_vcpu_unregister,
+	.pvsched_vcpu_notify_event = nop_pvsched_notify_event,
+	.name = "pvsched_nop",
+	.owner = THIS_MODULE,
+};
+
+static int __init pvsched_init(void)
+{
+	return WARN_ON(pvsched_register_vcpu_ops(&nop_vcpu_ops));
+}
+
+late_initcall(pvsched_init);
-- 
2.40.1