Hi Alex,

On 2021/4/16 4:43, Alex Williamson wrote:
> On Tue, 13 Apr 2021 17:14:45 +0800
> Keqian Zhu <zhukeqian1@xxxxxxxxxx> wrote:
> 
>> From: Kunkun Jiang <jiangkunkun@xxxxxxxxxx>
>>
>> In the past, we clear dirty log immediately after sync dirty
>> log to userspace. This may cause redundant dirty handling if
>> userspace handles dirty log iteratively:
>>
>> After vfio clears dirty log, new dirty log starts to generate.
>> These new dirty log will be reported to userspace even if they
>> are generated before userspace handles the same dirty page.
>>
>> That's to say, we should minimize the time gap of dirty log
>> clearing and dirty log handling. We can give userspace the
>> interface to clear dirty log.
> 
> IIUC, a user would be expected to clear the bitmap before copying the
> dirty pages, therefore you're trying to reduce that time gap between
> clearing any copy, but it cannot be fully eliminated and importantly,
> if the user clears after copying, they've introduced a race. Correct?
Yep, that's correct. If the user clears after copying, dirty info
generated during the copy may be lost. I'll document the expected
ordering.
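
To make that ordering concrete, below is a minimal userspace sketch
against the uapi this patch adds (sync_and_clear() is only an
illustrative helper, not part of any real API; it assumes dirty
tracking was already started with _START and that manual clear is
enabled on the container):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/*
 * One pre-copy iteration over [iova, iova + size). 'container' is the
 * VFIO container fd; 'bits' is a user-allocated bitmap with one bit
 * per pgsize page.
 */
static int sync_and_clear(int container, __u64 iova, __u64 size,
			  __u64 pgsize, __u64 *bits)
{
	struct vfio_iommu_type1_dirty_bitmap *hdr;
	struct vfio_iommu_type1_dirty_bitmap_get *range;
	int ret = -1;

	hdr = calloc(1, sizeof(*hdr) + sizeof(*range));
	if (!hdr)
		return -1;
	range = (struct vfio_iommu_type1_dirty_bitmap_get *)hdr->data;

	hdr->argsz = sizeof(*hdr) + sizeof(*range);
	range->iova = iova;
	range->size = size;
	range->bitmap.pgsize = pgsize;
	/* one bit per page, rounded up to a u64 granule, in bytes */
	range->bitmap.size = ((size / pgsize + 63) / 64) * 8;
	range->bitmap.data = bits;

	/* 1) fetch the current dirty bitmap */
	hdr->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
	if (ioctl(container, VFIO_IOMMU_DIRTY_PAGES, hdr))
		goto out;

	/*
	 * 2) clear those bits *before* copying the pages: a write that
	 * races with the copy re-dirties the page, so it is reported
	 * again on the next iteration instead of being lost.
	 */
	hdr->flags = VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
	if (ioctl(container, VFIO_IOMMU_DIRTY_PAGES, hdr))
		goto out;

	/* 3) only now copy out the pages whose bits are set in 'bits' */
	ret = 0;
out:
	free(hdr);
	return ret;
}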
> 
> What results do you have to show that this is a worthwhile optimization?
This optimization is inspired by KVM[1]. The results vary with the
guest workload: in theory, the higher the dirty rate, the bigger the
gain. Unfortunately I could only test on our FPGA, where the dirty
rate is heavily limited, so the improvement is not obvious there.

> 
> I really don't like the semantics that testing for an IOMMU capability
> enables it. It needs to be explicitly controllable feature, which
> suggests to me that it might be a flag used in combination with _GET or
> a separate _GET_NOCLEAR operations. Thanks,
Yes, good suggestion. We should give userspace a choice instead of
enabling the feature as a side effect of probing for it.
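
For reference, with the v1 semantics a plain capability probe is
enough to flip the container into manual-clear mode, which is exactly
the surprising side effect you point out (probe_manual_clear() is
just an illustrative name):

#include <sys/ioctl.h>
#include <linux/vfio.h>

static int probe_manual_clear(int container)
{
	/*
	 * Returns 1 if the extension is supported, but with this patch
	 * the probe itself also *enables* manual clear. Note it must be
	 * issued after VFIO_SET_IOMMU, or it always returns 0.
	 */
	return ioctl(container, VFIO_CHECK_EXTENSION,
		     VFIO_DIRTY_LOG_MANUAL_CLEAR);
}

We will make enabling explicit in the next version, e.g. via a flag
used in combination with _GET or a separate _GET_NOCLEAR operation,
as you suggest.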
Thanks,
Keqian

[1] https://lore.kernel.org/kvm/1543251253-24762-1-git-send-email-pbonzini@xxxxxxxxxx/

> 
> Alex
> 
> 
>> Co-developed-by: Keqian Zhu <zhukeqian1@xxxxxxxxxx>
>> Signed-off-by: Kunkun Jiang <jiangkunkun@xxxxxxxxxx>
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 100 ++++++++++++++++++++++++++++++--
>>  include/uapi/linux/vfio.h       |  28 ++++++++-
>>  2 files changed, 123 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 77950e47f56f..d9c4a27b3c4e 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -78,6 +78,7 @@ struct vfio_iommu {
>>  	bool			v2;
>>  	bool			nesting;
>>  	bool			dirty_page_tracking;
>> +	bool			dirty_log_manual_clear;
>>  	bool			pinned_page_dirty_scope;
>>  	bool			container_open;
>>  };
>> @@ -1242,6 +1243,78 @@ static int vfio_iommu_dirty_log_sync(struct vfio_iommu *iommu,
>>  	return ret;
>>  }
>>  
>> +static int vfio_iova_dirty_log_clear(u64 __user *bitmap,
>> +				     struct vfio_iommu *iommu,
>> +				     dma_addr_t iova, size_t size,
>> +				     size_t pgsize)
>> +{
>> +	struct vfio_dma *dma;
>> +	struct rb_node *n;
>> +	dma_addr_t start_iova, end_iova, riova;
>> +	unsigned long pgshift = __ffs(pgsize);
>> +	unsigned long bitmap_size;
>> +	unsigned long *bitmap_buffer = NULL;
>> +	bool clear_valid;
>> +	int rs, re, start, end, dma_offset;
>> +	int ret = 0;
>> +
>> +	bitmap_size = DIRTY_BITMAP_BYTES(size >> pgshift);
>> +	bitmap_buffer = kvmalloc(bitmap_size, GFP_KERNEL);
>> +	if (!bitmap_buffer) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	if (copy_from_user(bitmap_buffer, bitmap, bitmap_size)) {
>> +		ret = -EFAULT;
>> +		goto out;
>> +	}
>> +
>> +	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
>> +		dma = rb_entry(n, struct vfio_dma, node);
>> +		if (!dma->iommu_mapped)
>> +			continue;
>> +		if ((dma->iova + dma->size - 1) < iova)
>> +			continue;
>> +		if (dma->iova > iova + size - 1)
>> +			break;
>> +
>> +		start_iova = max(iova, dma->iova);
>> +		end_iova = min(iova + size, dma->iova + dma->size);
>> +
>> +		/* Similar logic as the tail of vfio_iova_dirty_bitmap */
>> +
>> +		clear_valid = false;
>> +		start = (start_iova - iova) >> pgshift;
>> +		end = (end_iova - iova) >> pgshift;
>> +		bitmap_for_each_set_region(bitmap_buffer, rs, re, start, end) {
>> +			clear_valid = true;
>> +			riova = iova + (rs << pgshift);
>> +			dma_offset = (riova - dma->iova) >> pgshift;
>> +			bitmap_clear(dma->bitmap, dma_offset, re - rs);
>> +		}
>> +
>> +		if (clear_valid)
>> +			vfio_dma_populate_bitmap(dma, pgsize);
>> +
>> +		if (clear_valid && !iommu->pinned_page_dirty_scope &&
>> +		    dma->iommu_mapped && !iommu->num_non_hwdbm_groups) {
>> +			ret = vfio_iommu_dirty_log_clear(iommu, start_iova,
>> +					end_iova - start_iova, bitmap_buffer,
>> +					iova, pgshift);
>> +			if (ret) {
>> +				pr_warn("dma dirty log clear failed!\n");
>> +				goto out;
>> +			}
>> +		}
>> +
>> +	}
>> +
>> +out:
>> +	kfree(bitmap_buffer);
>> +	return ret;
>> +}
>> +
>>  static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
>>  			      struct vfio_dma *dma, dma_addr_t base_iova,
>>  			      size_t pgsize)
>> @@ -1329,6 +1402,10 @@ static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
>>  		if (ret)
>>  			return ret;
>>  
>> +		/* Do not clear dirty automatically when manual_clear enabled */
>> +		if (iommu->dirty_log_manual_clear)
>> +			continue;
>> +
>>  		/* Clear iommu dirty log to re-enable dirty log tracking */
>>  		if (iommu->num_non_pinned_groups && dma->iommu_mapped &&
>>  		    !iommu->num_non_hwdbm_groups) {
>> @@ -2946,6 +3023,11 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
>>  		if (!iommu)
>>  			return 0;
>>  		return vfio_domains_have_iommu_cache(iommu);
>> +	case VFIO_DIRTY_LOG_MANUAL_CLEAR:
>> +		if (!iommu)
>> +			return 0;
>> +		iommu->dirty_log_manual_clear = true;
>> +		return 1;
>>  	default:
>>  		return 0;
>>  	}
>> @@ -3201,7 +3283,8 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>>  	struct vfio_iommu_type1_dirty_bitmap dirty;
>>  	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
>>  			VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
>> -			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
>> +			VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP |
>> +			VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP;
>>  	unsigned long minsz;
>>  	int ret = 0;
>>  
>> @@ -3243,7 +3326,8 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>>  		}
>>  		mutex_unlock(&iommu->lock);
>>  		return 0;
>> -	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
>> +	} else if (dirty.flags & (VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP |
>> +				  VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP)) {
>>  		struct vfio_iommu_type1_dirty_bitmap_get range;
>>  		unsigned long pgshift;
>>  		size_t data_size = dirty.argsz - minsz;
>> @@ -3286,13 +3370,21 @@ static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>>  			goto out_unlock;
>>  		}
>>  
>> -		if (iommu->dirty_page_tracking)
>> +		if (!iommu->dirty_page_tracking) {
>> +			ret = -EINVAL;
>> +			goto out_unlock;
>> +		}
>> +
>> +		if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
>>  			ret = vfio_iova_dirty_bitmap(range.bitmap.data,
>>  						     iommu, range.iova,
>>  						     range.size,
>>  						     range.bitmap.pgsize);
>>  		else
>> -			ret = -EINVAL;
>> +			ret = vfio_iova_dirty_log_clear(range.bitmap.data,
>> +							iommu, range.iova,
>> +							range.size,
>> +							range.bitmap.pgsize);
>>  out_unlock:
>>  		mutex_unlock(&iommu->lock);
>>
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 8ce36c1d53ca..784dc3cf2a8f 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -52,6 +52,14 @@
>>  /* Supports the vaddr flag for DMA map and unmap */
>>  #define VFIO_UPDATE_VADDR		10
>>  
>> +/*
>> + * The vfio_iommu driver may support user clears dirty log manually, which means
>> + * dirty log is not cleared automatically after dirty log is copied to userspace,
>> + * it's user's duty to clear dirty log. Note: when user queries this extension
>> + * and vfio_iommu driver supports it, then it is enabled.
>> + */
>> +#define VFIO_DIRTY_LOG_MANUAL_CLEAR	11
>> +
>>  /*
>>   * The IOCTL interface is designed for extensibility by embedding the
>>   * structure length (argsz) and flags into structures passed between
>> @@ -1188,7 +1196,24 @@ struct vfio_iommu_type1_dma_unmap {
>>   * actual bitmap. If dirty pages logging is not enabled, an error will be
>>   * returned.
>>   *
>> - * Only one of the flags _START, _STOP and _GET may be specified at a time.
>> + * Calling the IOCTL with VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP flag set,
>> + * instructs the IOMMU driver to clear the dirty status of pages in a bitmap
>> + * for IOMMU container for a given IOVA range. The user must specify the IOVA
>> + * range, the bitmap and the pgsize through the structure
>> + * vfio_iommu_type1_dirty_bitmap_get in the data[] portion. This interface
>> + * supports clearing a bitmap of the smallest supported pgsize only and can be
>> + * modified in future to clear a bitmap of any specified supported pgsize. The
>> + * user must provide a memory area for the bitmap memory and specify its size
>> + * in bitmap.size. One bit is used to represent one page consecutively starting
>> + * from iova offset. The user should provide page size in bitmap.pgsize field.
>> + * A bit set in the bitmap indicates that the page at that offset from iova is
>> + * cleared the dirty status, and dirty tracking is re-enabled for that page. The
>> + * caller must set argsz to a value including the size of structure
>> + * vfio_iommu_dirty_bitmap_get, but excluing the size of the actual bitmap. If
>> + * dirty pages logging is not enabled, an error will be returned.
>> + *
>> + * Only one of the flags _START, _STOP, _GET and _CLEAR may be specified at a
>> + * time.
>>   *
>>   */
>>  struct vfio_iommu_type1_dirty_bitmap {
>> @@ -1197,6 +1222,7 @@ struct vfio_iommu_type1_dirty_bitmap {
>>  #define VFIO_IOMMU_DIRTY_PAGES_FLAG_START	(1 << 0)
>>  #define VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP	(1 << 1)
>>  #define VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP	(1 << 2)
>> +#define VFIO_IOMMU_DIRTY_PAGES_FLAG_CLEAR_BITMAP	(1 << 3)
>>  	__u8 data[];
>>  };
>> 
> 
> .
> 