This is basically the same code adopted from KVM. The main use case is the future MSHV-VFIO bridge device. We don't have any plan to support in-kernel device emulation yet, but it wouldn't hurt to make the code more flexible. Signed-off-by: Wei Liu <wei.liu@xxxxxxxxxx> --- Documentation/virt/mshv/api.rst | 12 +++ drivers/hv/mshv_main.c | 181 ++++++++++++++++++++++++++++++++ include/linux/mshv.h | 57 ++++++++++ include/uapi/linux/mshv.h | 36 +++++++ 4 files changed, 286 insertions(+) diff --git a/Documentation/virt/mshv/api.rst b/Documentation/virt/mshv/api.rst index 56a6edfcfe29..7d35dd589831 100644 --- a/Documentation/virt/mshv/api.rst +++ b/Documentation/virt/mshv/api.rst @@ -170,4 +170,16 @@ Can be used to get/set various properties of a partition. Some properties can only be set at partition creation. For these, there are parameters in MSHV_CREATE_PARTITION. +3.12 MSHV_CREATE_DEVICE +----------------------- +:Type: partition ioctl +:Parameters: struct mshv_create_device +:Returns: 0 on success + +Can be used to create an in-kernel device. + +If the MSHV_CREATE_DEVICE_TEST flag is set, only test whether the +device type is supported (not necessarily whether it can be created +in the current vm). +Currently only supports VFIO type device.
diff --git a/drivers/hv/mshv_main.c b/drivers/hv/mshv_main.c
index 4cbc520471e4..84c774a561de 100644
--- a/drivers/hv/mshv_main.c
+++ b/drivers/hv/mshv_main.c
@@ -20,6 +20,8 @@
 #include <linux/random.h>
 #include <linux/mshv.h>
 #include <linux/mshv_eventfd.h>
+#include <linux/hyperv.h>
+#include <linux/nospec.h>
 #include <asm/mshyperv.h>
 
 #include "mshv.h"
@@ -33,6 +35,7 @@ static int mshv_vp_release(struct inode *inode, struct file *filp);
 static long mshv_vp_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
 static struct mshv_partition *mshv_partition_get(struct mshv_partition *partition);
 static void mshv_partition_put(struct mshv_partition *partition);
+static void mshv_partition_put_no_destroy(struct mshv_partition *partition);
 static int mshv_partition_release(struct inode *inode, struct file *filp);
 static long mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
 static int mshv_dev_open(struct inode *inode, struct file *filp);
@@ -912,6 +915,203 @@ mshv_partition_ioctl_set_msi_routing(struct mshv_partition *partition,
 	return ret;
 }
 
+/*
+ * Copy a struct mshv_device_attr from userspace and pass it to @accessor.
+ * Returns -EPERM if the device type does not implement @accessor and
+ * -EFAULT if the attribute cannot be read from @arg.
+ */
+static int mshv_device_ioctl_attr(struct mshv_device *dev,
+				  int (*accessor)(struct mshv_device *dev,
+						  struct mshv_device_attr *attr),
+				  unsigned long arg)
+{
+	struct mshv_device_attr attr;
+
+	if (!accessor)
+		return -EPERM;
+
+	if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+		return -EFAULT;
+
+	return accessor(dev, &attr);
+}
+
+/*
+ * ioctl handler for device fds: attribute ioctls are routed to the matching
+ * accessor, anything else goes to the device type's ioctl op if it has one.
+ */
+static long mshv_device_ioctl(struct file *filp, unsigned int ioctl,
+			      unsigned long arg)
+{
+	struct mshv_device *dev = filp->private_data;
+
+	switch (ioctl) {
+	case MSHV_SET_DEVICE_ATTR:
+		return mshv_device_ioctl_attr(dev, dev->ops->set_attr, arg);
+	case MSHV_GET_DEVICE_ATTR:
+		return mshv_device_ioctl_attr(dev, dev->ops->get_attr, arg);
+	case MSHV_HAS_DEVICE_ATTR:
+		return mshv_device_ioctl_attr(dev, dev->ops->has_attr, arg);
+	default:
+		if (dev->ops->ioctl)
+			return dev->ops->ioctl(dev, ioctl, arg);
+
+		return -ENOTTY;
+	}
+}
+
+/*
+ * Called when the device fd is closed. If the device type implements
+ * release, unlink and release the device under partition->mutex; in either
+ * case drop the partition reference taken when the fd was created.
+ */
+static int mshv_device_release(struct inode *inode, struct file *filp)
+{
+	struct mshv_device *dev = filp->private_data;
+	struct mshv_partition *partition = dev->partition;
+
+	if (dev->ops->release) {
+		mutex_lock(&partition->mutex);
+		list_del(&dev->partition_node);
+		dev->ops->release(dev);
+		mutex_unlock(&partition->mutex);
+	}
+
+	mshv_partition_put(partition);
+	return 0;
+}
+
+static const struct file_operations mshv_device_fops = {
+	.unlocked_ioctl = mshv_device_ioctl,
+	.release = mshv_device_release,
+};
+
+/* Device ops indexed by device type; a NULL slot means none registered. */
+static const struct mshv_device_ops *mshv_device_ops_table[MSHV_DEV_TYPE_MAX];
+
+/*
+ * Register @ops for device type @type. Returns -ENOSPC if @type is out of
+ * range and -EEXIST if ops are already registered for it.
+ */
+int mshv_register_device_ops(const struct mshv_device_ops *ops, u32 type)
+{
+	if (type >= ARRAY_SIZE(mshv_device_ops_table))
+		return -ENOSPC;
+
+	if (mshv_device_ops_table[type] != NULL)
+		return -EEXIST;
+
+	mshv_device_ops_table[type] = ops;
+	return 0;
+}
+
+/* Remove the ops registered for @type; out-of-range types are ignored. */
+void mshv_unregister_device_ops(u32 type)
+{
+	if (type >= ARRAY_SIZE(mshv_device_ops_table))
+		return;
+	mshv_device_ops_table[type] = NULL;
+}
+
+/*
+ * Handle MSHV_CREATE_DEVICE: look up the ops for the requested type, create
+ * the device, link it into partition->devices and return an fd for it in
+ * the fd field of the user's struct mshv_create_device.
+ *
+ * With MSHV_CREATE_DEVICE_TEST set, only report whether ops are registered
+ * for the type. Returns 0 on success or a negative errno.
+ */
+static long
+mshv_partition_ioctl_create_device(struct mshv_partition *partition,
+	void __user *user_args)
+{
+	long r;
+	struct mshv_create_device tmp, *cd;
+	struct mshv_device *dev;
+	const struct mshv_device_ops *ops;
+	int type;
+
+	if (copy_from_user(&tmp, user_args, sizeof(tmp))) {
+		r = -EFAULT;
+		goto out;
+	}
+
+	cd = &tmp;
+
+	if (cd->type >= ARRAY_SIZE(mshv_device_ops_table)) {
+		r = -ENODEV;
+		goto out;
+	}
+
+	/* Clamp the index so it cannot go out of bounds speculatively. */
+	type = array_index_nospec(cd->type, ARRAY_SIZE(mshv_device_ops_table));
+	ops = mshv_device_ops_table[type];
+	if (ops == NULL) {
+		r = -ENODEV;
+		goto out;
+	}
+
+	if (cd->flags & MSHV_CREATE_DEVICE_TEST) {
+		r = 0;
+		goto out;
+	}
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
+	if (!dev) {
+		r = -ENOMEM;
+		goto out;
+	}
+
+	dev->ops = ops;
+	dev->partition = partition;
+
+	r = ops->create(dev, type);
+	if (r < 0) {
+		kfree(dev);
+		goto out;
+	}
+
+	list_add(&dev->partition_node, &partition->devices);
+
+	if (ops->init)
+		ops->init(dev);
+
+	/* The device fd owns a partition reference, dropped on release. */
+	mshv_partition_get(partition);
+	r = anon_inode_getfd(ops->name, &mshv_device_fops, dev, O_RDWR | O_CLOEXEC);
+	if (r < 0) {
+		mshv_partition_put_no_destroy(partition);
+		list_del(&dev->partition_node);
+		ops->destroy(dev);
+		goto out;
+	}
+
+	cd->fd = r;
+	r = 0;
+
+	if (copy_to_user(user_args, &tmp, sizeof(tmp))) {
+		r = -EFAULT;
+		goto out;
+	}
+out:
+	return r;
+}
+
+/* Unlink and destroy every device still on @partition's device list. */
+static void mshv_destroy_devices(struct mshv_partition *partition)
+{
+	struct mshv_device *dev, *tmp;
+
+	/*
+	 * No need to take any lock since at this point nobody else can
+	 * reference this partition.
+	 */
+	list_for_each_entry_safe(dev, tmp, &partition->devices, partition_node) {
+		list_del(&dev->partition_node);
+		dev->ops->destroy(dev);
+	}
+}
+
 static long
 mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 {
@@ -965,6 +1165,10 @@ mshv_partition_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 	case MSHV_GET_GPA_ACCESS_STATES:
 		ret = mshv_partition_ioctl_get_gpa_access_state(partition,
 							(void __user *)arg);
 		break;
+	case MSHV_CREATE_DEVICE:
+		ret = mshv_partition_ioctl_create_device(partition,
+							 (void __user *)arg);
+		break;
 	default:
 		ret = -ENOTTY;
@@ -1033,6 +1237,7 @@ destroy_partition(struct mshv_partition *partition)
 		vfree(region->pages);
 	}
 
+	mshv_destroy_devices(partition);
 	mshv_free_msi_routing(partition);
 	kfree(partition);
 }
@@ -1052,6 +1257,16 @@ mshv_partition_put(struct mshv_partition *partition)
 		destroy_partition(partition);
 }
 
+/*
+ * Drop a reference that must not be the last one; warns if it was. Used to
+ * undo mshv_partition_get() without going through destroy_partition().
+ */
+static void
+mshv_partition_put_no_destroy(struct mshv_partition *partition)
+{
+	WARN_ON(refcount_dec_and_test(&partition->ref_count));
+}
+
 static int
 mshv_partition_release(struct inode *inode, struct file *filp)
 {
@@ -1122,6 +1337,8 @@ mshv_ioctl_create_partition(void __user *user_arg)
 
 	INIT_HLIST_HEAD(&partition->irq_ack_notifier_list);
 
+	INIT_LIST_HEAD(&partition->devices);
+
 	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (fd < 0) {
 		ret = fd;
diff --git a/include/linux/mshv.h b/include/linux/mshv.h
index
fc655b60c5cd..c557ffeec90c 100644 --- a/include/linux/mshv.h +++ b/include/linux/mshv.h @@ -59,6 +59,8 @@ struct mshv_partition { struct srcu_struct irq_srcu; struct hlist_head irq_ack_notifier_list; + struct list_head devices; + struct { spinlock_t lock; struct list_head items; @@ -121,4 +123,59 @@ struct mshv { } partitions; }; +struct mshv_device { + const struct mshv_device_ops *ops; + struct mshv_partition *partition; + void *private; + struct list_head partition_node; + +}; + +/* create, destroy, and name are mandatory */ +struct mshv_device_ops { + const char *name; + + /* + * create is called holding partition->mutex and any operations not suitable + * to do while holding the lock should be deferred to init (see + * below). + */ + int (*create)(struct mshv_device *dev, u32 type); + + /* + * init is called after create if create is successful and is called + * outside of holding partition->mutex. + */ + void (*init)(struct mshv_device *dev); + + /* + * Destroy is responsible for freeing dev. + * + * Destroy may be called before or after destructors are called + * on emulated I/O regions, depending on whether a reference is + * held by a vcpu or other mshv component that gets destroyed + * after the emulated I/O. + */ + void (*destroy)(struct mshv_device *dev); + + /* + * Release is an alternative method to free the device. It is + * called when the device file descriptor is closed. Once + * release is called, the destroy method will not be called + * anymore as the device is removed from the device list of + * the partition. partition->mutex is held.
+ */ + void (*release)(struct mshv_device *dev); + + int (*set_attr)(struct mshv_device *dev, struct mshv_device_attr *attr); + int (*get_attr)(struct mshv_device *dev, struct mshv_device_attr *attr); + int (*has_attr)(struct mshv_device *dev, struct mshv_device_attr *attr); + long (*ioctl)(struct mshv_device *dev, unsigned int ioctl, + unsigned long arg); + int (*mmap)(struct mshv_device *dev, struct vm_area_struct *vma); +}; + +int mshv_register_device_ops(const struct mshv_device_ops *ops, u32 type); +void mshv_unregister_device_ops(u32 type); + #endif diff --git a/include/uapi/linux/mshv.h b/include/uapi/linux/mshv.h index cba318ee7cf5..ed110109492f 100644 --- a/include/uapi/linux/mshv.h +++ b/include/uapi/linux/mshv.h @@ -158,6 +158,14 @@ struct mshv_msi_routing { #define MSHV_SET_VP_STATE _IOWR(MSHV_IOCTL, 0x0B, struct mshv_vp_state) #define MSHV_TRANSLATE_GVA _IOWR(MSHV_IOCTL, 0x0E, struct mshv_translate_gva) +/* ioctl for partition fd; creates a device and returns its fd */ +#define MSHV_CREATE_DEVICE _IOWR(MSHV_IOCTL, 0x13, struct mshv_create_device) + +/* ioctls for fds returned by MSHV_CREATE_DEVICE */ +#define MSHV_SET_DEVICE_ATTR _IOW(MSHV_IOCTL, 0x14, struct mshv_device_attr) +#define MSHV_GET_DEVICE_ATTR _IOW(MSHV_IOCTL, 0x15, struct mshv_device_attr) +#define MSHV_HAS_DEVICE_ATTR _IOW(MSHV_IOCTL, 0x16, struct mshv_device_attr) + /* register page mapping example: * struct hv_vp_register_page *regs = mmap(NULL, * 4096, @@ -184,4 +192,32 @@ union mshv_partition_property_page_access_tracking_config { __u64 as_uint64; } __packed; +/* + * Device control API.
+ */ +#define MSHV_CREATE_DEVICE_TEST 1 + +struct mshv_create_device { + __u32 type; /* in: MSHV_DEV_TYPE_xxx */ + __u32 fd; /* out: fd of the created device */ + __u32 flags; /* in: MSHV_CREATE_DEVICE_xxx */ +}; + +#define MSHV_DEV_VFIO_GROUP 1 +#define MSHV_DEV_VFIO_GROUP_ADD 1 +#define MSHV_DEV_VFIO_GROUP_DEL 2 + +enum mshv_device_type { + MSHV_DEV_TYPE_VFIO, +#define MSHV_DEV_TYPE_VFIO MSHV_DEV_TYPE_VFIO + MSHV_DEV_TYPE_MAX, +}; + +struct mshv_device_attr { + __u32 flags; /* no flags currently defined */ + __u32 group; /* device-defined */ + __u64 attr; /* group-defined */ + __u64 addr; /* userspace address of attr data */ +}; + #endif -- 2.30.2