On Thu, 14 Jul 2022 11:12:46 +0300 Yishai Hadas <yishaih@xxxxxxxxxx> wrote:

> Introduce the DMA logging feature support in the vfio core layer.
> 
> It includes the processing of the device start/stop/report DMA logging
> UAPIs and calling the relevant driver 'op' to do the work.
> 
> Specifically,
> Upon start, the core translates the given input ranges into an interval
> tree, checks for unexpected overlapping, non aligned ranges and then
> pass the translated input to the driver for start tracking the given
> ranges.
> 
> Upon report, the core translates the given input user space bitmap and
> page size into an IOVA kernel bitmap iterator. Then it iterates it and
> call the driver to set the corresponding bits for the dirtied pages in a
> specific IOVA range.
> 
> Upon stop, the driver is called to stop the previous started tracking.
> 
> The next patches from the series will introduce the mlx5 driver
> implementation for the logging ops.
> 
> Signed-off-by: Yishai Hadas <yishaih@xxxxxxxxxx>
> ---
>  drivers/vfio/Kconfig             |   1 +
>  drivers/vfio/pci/vfio_pci_core.c |   5 +
>  drivers/vfio/vfio_main.c         | 161 +++++++++++++++++++++++++++++++
>  include/linux/vfio.h             |  21 +++-
>  4 files changed, 186 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index 6130d00252ed..86c381ceb9a1 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -3,6 +3,7 @@ menuconfig VFIO
>  	tristate "VFIO Non-Privileged userspace driver framework"
>  	select IOMMU_API
>  	select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64)
> +	select INTERVAL_TREE
>  	help
>  	  VFIO provides a framework for secure userspace device drivers.
>  	  See Documentation/driver-api/vfio.rst for more details.
> diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
> index 2efa06b1fafa..b6dabf398251 100644
> --- a/drivers/vfio/pci/vfio_pci_core.c
> +++ b/drivers/vfio/pci/vfio_pci_core.c
> @@ -1862,6 +1862,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
>  		return -EINVAL;
>  	}
>  
> +	if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
> +	    vdev->vdev.log_ops->log_stop &&
> +	    vdev->vdev.log_ops->log_read_and_clear))
> +		return -EINVAL;
> +
>  	/*
>  	 * Prevent binding to PFs with VFs enabled, the VFs might be in use
>  	 * by the host or other users.  We cannot capture the VFs if they
> diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
> index bd84ca7c5e35..2414d827e3c8 100644
> --- a/drivers/vfio/vfio_main.c
> +++ b/drivers/vfio/vfio_main.c
> @@ -32,6 +32,8 @@
>  #include <linux/vfio.h>
>  #include <linux/wait.h>
>  #include <linux/sched/signal.h>
> +#include <linux/interval_tree.h>
> +#include <linux/iova_bitmap.h>
>  #include "vfio.h"
>  
>  #define DRIVER_VERSION	"0.3"
> @@ -1603,6 +1605,153 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
>  	return 0;
>  }
>  
> +#define LOG_MAX_RANGES 1024
> +
> +static int
> +vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
> +					u32 flags, void __user *arg,
> +					size_t argsz)
> +{
> +	size_t minsz =
> +		offsetofend(struct vfio_device_feature_dma_logging_control,
> +			    ranges);
> +	struct vfio_device_feature_dma_logging_range __user *ranges;
> +	struct vfio_device_feature_dma_logging_control control;
> +	struct vfio_device_feature_dma_logging_range range;
> +	struct rb_root_cached root = RB_ROOT_CACHED;
> +	struct interval_tree_node *nodes;
> +	u32 nnodes;
> +	int i, ret;
> +
> +	if (!device->log_ops)
> +		return -ENOTTY;
> +
> +	ret = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET,
> +				 sizeof(control));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&control, arg, minsz))
> +		return -EFAULT;
> +
> +	nnodes = control.num_ranges;
> +	if (!nnodes || nnodes > LOG_MAX_RANGES)
> +		return -EINVAL;

The latter looks more like an -E2BIG errno.  This is a hard coded
limit, but what are the heuristics?  Can a user introspect the limit?
Thanks,

Alex

> +
> +	ranges = u64_to_user_ptr(control.ranges);
> +	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
> +			      GFP_KERNEL);
> +	if (!nodes)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < nnodes; i++) {
> +		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
> +			ret = -EFAULT;
> +			goto end;
> +		}
> +		if (!IS_ALIGNED(range.iova, control.page_size) ||
> +		    !IS_ALIGNED(range.length, control.page_size)) {
> +			ret = -EINVAL;
> +			goto end;
> +		}
> +		nodes[i].start = range.iova;
> +		nodes[i].last = range.iova + range.length - 1;
> +		if (interval_tree_iter_first(&root, nodes[i].start,
> +					     nodes[i].last)) {
> +			/* Range overlapping */
> +			ret = -EINVAL;
> +			goto end;
> +		}
> +		interval_tree_insert(nodes + i, &root);
> +	}
> +
> +	ret = device->log_ops->log_start(device, &root, nnodes,
> +					 &control.page_size);
> +	if (ret)
> +		goto end;
> +
> +	if (copy_to_user(arg, &control, sizeof(control))) {
> +		ret = -EFAULT;
> +		device->log_ops->log_stop(device);
> +	}
> +
> +end:
> +	kfree(nodes);
> +	return ret;
> +}
> +
> +static int
> +vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
> +				       u32 flags, void __user *arg,
> +				       size_t argsz)
> +{
> +	int ret;
> +
> +	if (!device->log_ops)
> +		return -ENOTTY;
> +
> +	ret = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_SET, 0);
> +	if (ret != 1)
> +		return ret;
> +
> +	return device->log_ops->log_stop(device);
> +}
> +
> +static int
> +vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
> +					 u32 flags, void __user *arg,
> +					 size_t argsz)
> +{
> +	size_t minsz =
> +		offsetofend(struct vfio_device_feature_dma_logging_report,
> +			    bitmap);
> +	struct vfio_device_feature_dma_logging_report report;
> +	struct iova_bitmap_iter iter;
> +	int ret;
> +
> +	if (!device->log_ops)
> +		return -ENOTTY;
> +
> +	ret = vfio_check_feature(flags, argsz,
> +				 VFIO_DEVICE_FEATURE_GET,
> +				 sizeof(report));
> +	if (ret != 1)
> +		return ret;
> +
> +	if (copy_from_user(&report, arg, minsz))
> +		return -EFAULT;
> +
> +	if (report.page_size < PAGE_SIZE)
> +		return -EINVAL;
> +
> +	iova_bitmap_init(&iter.dirty, report.iova, ilog2(report.page_size));
> +	ret = iova_bitmap_iter_init(&iter, report.iova, report.length,
> +				    u64_to_user_ptr(report.bitmap));
> +	if (ret)
> +		return ret;
> +
> +	for (; !iova_bitmap_iter_done(&iter);
> +	     iova_bitmap_iter_advance(&iter)) {
> +		ret = iova_bitmap_iter_get(&iter);
> +		if (ret)
> +			break;
> +
> +		ret = device->log_ops->log_read_and_clear(device,
> +			iova_bitmap_iova(&iter),
> +			iova_bitmap_length(&iter), &iter.dirty);
> +
> +		iova_bitmap_iter_put(&iter);
> +
> +		if (ret)
> +			break;
> +	}
> +
> +	iova_bitmap_iter_free(&iter);
> +	return ret;
> +}
> +
>  static int vfio_ioctl_device_feature(struct vfio_device *device,
>  				     struct vfio_device_feature __user *arg)
>  {
> @@ -1636,6 +1785,18 @@ static int vfio_ioctl_device_feature(struct vfio_device *device,
>  		return vfio_ioctl_device_feature_mig_device_state(
>  			device, feature.flags, arg->data,
>  			feature.argsz - minsz);
> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
> +		return vfio_ioctl_device_feature_logging_start(
> +			device, feature.flags, arg->data,
> +			feature.argsz - minsz);
> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
> +		return vfio_ioctl_device_feature_logging_stop(
> +			device, feature.flags, arg->data,
> +			feature.argsz - minsz);
> +	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
> +		return vfio_ioctl_device_feature_logging_report(
> +			device, feature.flags, arg->data,
> +			feature.argsz - minsz);
>  	default:
>  		if (unlikely(!device->ops->device_feature))
>  			return -EINVAL;
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 4d26e149db81..feed84d686ec 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -14,6 +14,7 @@
>  #include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <uapi/linux/vfio.h>
> +#include <linux/iova_bitmap.h>
>  
>  struct kvm;
>  
> @@ -33,10 +34,11 @@ struct vfio_device {
>  	struct device *dev;
>  	const struct vfio_device_ops *ops;
>  	/*
> -	 * mig_ops is a static property of the vfio_device which must be set
> -	 * prior to registering the vfio_device.
> +	 * mig_ops/log_ops is a static property of the vfio_device which must
> +	 * be set prior to registering the vfio_device.
>  	 */
>  	const struct vfio_migration_ops *mig_ops;
> +	const struct vfio_log_ops *log_ops;
>  	struct vfio_group *group;
>  	struct vfio_device_set *dev_set;
>  	struct list_head dev_set_list;
> @@ -104,6 +106,21 @@ struct vfio_migration_ops {
>  				   enum vfio_device_mig_state *curr_state);
>  };
>  
> +/**
> + * @log_start: Optional callback to ask the device start DMA logging.
> + * @log_stop: Optional callback to ask the device stop DMA logging.
> + * @log_read_and_clear: Optional callback to ask the device read
> + *		and clear the dirty DMAs in some given range.
> + */
> +struct vfio_log_ops {
> +	int (*log_start)(struct vfio_device *device,
> +		struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
> +	int (*log_stop)(struct vfio_device *device);
> +	int (*log_read_and_clear)(struct vfio_device *device,
> +		unsigned long iova, unsigned long length,
> +		struct iova_bitmap *dirty);
> +};
> +
>  /**
>   * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
>   * @flags: Arg from the device_feature op
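
As an aside, while walking through the start path above I sketched what
I'd expect the userspace side to look like.  Rough and untested; it
assumes the vfio_device_feature_dma_logging_control/range layouts and
the DMA_LOGGING_START feature bit proposed in the uapi patch of this
series, and the helper name plus the 4K page size are only for
illustration:

/*
 * Untested sketch, not part of this patch: start dirty tracking on a
 * single IOVA range.  Structure layouts and the DMA_LOGGING_START
 * feature bit are assumed from the uapi patch in this series; the
 * helper name and the 4K page size are illustrative only.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int dma_logging_start_one(int device_fd, __u64 iova, __u64 length)
{
	/* iova/length must both be aligned to the requested page_size */
	struct vfio_device_feature_dma_logging_range range = {
		.iova = iova,
		.length = length,
	};
	/* feature header immediately followed by the control payload */
	__u8 buf[sizeof(struct vfio_device_feature) +
		 sizeof(struct vfio_device_feature_dma_logging_control)] = { 0 };
	struct vfio_device_feature *feature = (void *)buf;
	struct vfio_device_feature_dma_logging_control *control =
		(void *)feature->data;

	feature->argsz = sizeof(buf);
	feature->flags = VFIO_DEVICE_FEATURE_SET |
			 VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
	control->page_size = 4096;	/* written back by the kernel on success */
	control->num_ranges = 1;	/* core caps this at LOG_MAX_RANGES */
	control->ranges = (uintptr_t)&range;	/* user pointer to range array */

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}

The control struct rides in the feature data[] area while the ranges
array is only referenced by pointer, which is exactly where the
LOG_MAX_RANGES cap noted above comes into play.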