On 4/30/22 05:11, Baolu Lu wrote:
> On 2022/4/29 05:09, Joao Martins wrote:
>> Add an IO pagetable API iopt_read_and_clear_dirty_data() that
>> reads the dirty IOPTEs for a given IOVA range and then copies
>> them back to userspace from each area-internal bitmap.
>>
>> Underneath it uses the IOMMU equivalent API, which reads the
>> dirty bits as well as atomically clearing the IOPTE dirty bit
>> and flushing the IOTLB at the end. The dirty bitmaps pass an
>> iotlb_gather to allow batching the dirty-bit updates.
>>
>> Most of the complexity, though, is in the handling of the user
>> bitmaps to avoid copies back and forth. The bitmap user addresses
>> need to be iterated through, pinned, and the pages then passed
>> into the iommu core. The amount of bitmap data passed at a time
>> for a read_and_clear_dirty() is 1 page worth of pinned base page
>> pointers. That equates to 16M bits, or rather 64G worth of IOVA
>> that can be reported as 'dirtied'. We flush the IOTLB at the end
>> of the whole scanned IOVA range, to defer the potential DMA
>> performance penalty as much as possible.
>>
>> Signed-off-by: Joao Martins <joao.m.martins@xxxxxxxxxx>
>> ---
>>  drivers/iommu/iommufd/io_pagetable.c    | 169 ++++++++++++++++++++++++
>>  drivers/iommu/iommufd/iommufd_private.h |  44 ++++++
>>  2 files changed, 213 insertions(+)
>>
>> diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
>> index f4609ef369e0..835b5040fce9 100644
>> --- a/drivers/iommu/iommufd/io_pagetable.c
>> +++ b/drivers/iommu/iommufd/io_pagetable.c
>> @@ -14,6 +14,7 @@
>>  #include <linux/err.h>
>>  #include <linux/slab.h>
>>  #include <linux/errno.h>
>> +#include <uapi/linux/iommufd.h>
>>
>>  #include "io_pagetable.h"
>>
>> @@ -347,6 +348,174 @@ int iopt_set_dirty_tracking(struct io_pagetable *iopt,
>>  	return ret;
>>  }
>>
>> +int iommufd_dirty_iter_init(struct iommufd_dirty_iter *iter,
>> +			    struct iommufd_dirty_data *bitmap)
>> +{
>> +	struct iommu_dirty_bitmap *dirty = &iter->dirty;
>> +	unsigned long bitmap_len;
>> +
>> +	bitmap_len = dirty_bitmap_bytes(bitmap->length >> dirty->pgshift);
>> +
>> +	import_single_range(WRITE, bitmap->data, bitmap_len,
>> +			    &iter->bitmap_iov, &iter->bitmap_iter);
>> +	iter->iova = bitmap->iova;
>> +
>> +	/* Can record up to 64G at a time */
>> +	dirty->pages = (struct page **) __get_free_page(GFP_KERNEL);
>> +
>> +	return !dirty->pages ? -ENOMEM : 0;
>> +}
>> +
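FWIW, since the "64G at a time" in the comment above isn't obvious, this
is the arithmetic behind it, assuming 4K base pages and 8-byte pointers
(variable names below are just for illustration):

	unsigned long ptrs = PAGE_SIZE / sizeof(struct page *);	  /* 4096 / 8 = 512 pinned pages */
	unsigned long bitmap_bytes = ptrs * PAGE_SIZE;		  /* 512 * 4K = 2M of bitmap */
	unsigned long bitmap_bits = bitmap_bytes * BITS_PER_BYTE; /* 16M bits */
	unsigned long iova_span = bitmap_bits * PAGE_SIZE;	  /* 16M * 4K = 64G of IOVA */

Which matches the 16M bits / 64G numbers in the commit message; with a
bigger IOMMU page size each bit covers more IOVA, so the span per
iteration only grows.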
>> +void iommufd_dirty_iter_free(struct iommufd_dirty_iter *iter)
>> +{
>> +	struct iommu_dirty_bitmap *dirty = &iter->dirty;
>> +
>> +	if (dirty->pages) {
>> +		free_page((unsigned long) dirty->pages);
>> +		dirty->pages = NULL;
>> +	}
>> +}
>> +
>> +bool iommufd_dirty_iter_done(struct iommufd_dirty_iter *iter)
>> +{
>> +	return iov_iter_count(&iter->bitmap_iter) > 0;
>> +}
>> +
>> +static inline unsigned long iommufd_dirty_iter_bytes(struct iommufd_dirty_iter *iter)
>> +{
>> +	unsigned long left = iter->bitmap_iter.count - iter->bitmap_iter.iov_offset;
>> +
>> +	left = min_t(unsigned long, left, (iter->dirty.npages << PAGE_SHIFT));
>> +
>> +	return left;
>> +}
>> +
>> +unsigned long iommufd_dirty_iova_length(struct iommufd_dirty_iter *iter)
>> +{
>> +	unsigned long left = iommufd_dirty_iter_bytes(iter);
>> +
>> +	return ((BITS_PER_BYTE * left) << iter->dirty.pgshift);
>> +}
>> +
>> +unsigned long iommufd_dirty_iova(struct iommufd_dirty_iter *iter)
>> +{
>> +	unsigned long skip = iter->bitmap_iter.iov_offset;
>> +
>> +	return iter->iova + ((BITS_PER_BYTE * skip) << iter->dirty.pgshift);
>> +}
>> +
>> +void iommufd_dirty_iter_advance(struct iommufd_dirty_iter *iter)
>> +{
>> +	iov_iter_advance(&iter->bitmap_iter, iommufd_dirty_iter_bytes(iter));
>> +}
>> +
>> +void iommufd_dirty_iter_put(struct iommufd_dirty_iter *iter)
>> +{
>> +	struct iommu_dirty_bitmap *dirty = &iter->dirty;
>> +
>> +	if (dirty->npages)
>> +		unpin_user_pages(dirty->pages, dirty->npages);
>> +}
>> +
>> +int iommufd_dirty_iter_get(struct iommufd_dirty_iter *iter)
>> +{
>> +	struct iommu_dirty_bitmap *dirty = &iter->dirty;
>> +	unsigned long npages;
>> +	unsigned long ret;
>> +	void *addr;
>> +
>> +	addr = iter->bitmap_iov.iov_base + iter->bitmap_iter.iov_offset;
>> +	npages = iov_iter_npages(&iter->bitmap_iter,
>> +				 PAGE_SIZE / sizeof(struct page *));
>> +
>> +	ret = pin_user_pages_fast((unsigned long) addr, npages,
>> +				  FOLL_WRITE, dirty->pages);
>> +	if (ret <= 0)
>> +		return -EINVAL;
>> +
>> +	dirty->npages = ret;
>> +	dirty->iova = iommufd_dirty_iova(iter);
>> +	dirty->start_offset = offset_in_page(addr);
>> +	return 0;
>> +}
>> +
>> +static int iommu_read_and_clear_dirty(struct iommu_domain *domain,
>> +				      struct iommufd_dirty_data *bitmap)
>
> This looks more like a helper in the iommu core. How about
>
> iommufd_read_clear_domain_dirty()?
>

Heh, I guess that's more accurate naming indeed. I can switch to that.
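i.e., if I read your suggestion right, just the rename, keeping the
signature as is:

	static int iommufd_read_clear_domain_dirty(struct iommu_domain *domain,
						   struct iommufd_dirty_data *bitmap)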
>> +{
>> +	const struct iommu_domain_ops *ops = domain->ops;
>> +	struct iommu_iotlb_gather gather;
>> +	struct iommufd_dirty_iter iter;
>> +	int ret = 0;
>> +
>> +	if (!ops || !ops->read_and_clear_dirty)
>> +		return -EOPNOTSUPP;
>> +
>> +	iommu_dirty_bitmap_init(&iter.dirty, bitmap->iova,
>> +				__ffs(bitmap->page_size), &gather);
>> +	ret = iommufd_dirty_iter_init(&iter, bitmap);
>> +	if (ret)
>> +		return -ENOMEM;
>> +
>> +	for (; iommufd_dirty_iter_done(&iter);
>> +	     iommufd_dirty_iter_advance(&iter)) {
>> +		ret = iommufd_dirty_iter_get(&iter);
>> +		if (ret)
>> +			break;
>> +
>> +		ret = ops->read_and_clear_dirty(domain,
>> +				iommufd_dirty_iova(&iter),
>> +				iommufd_dirty_iova_length(&iter), &iter.dirty);
>> +
>> +		iommufd_dirty_iter_put(&iter);
>> +
>> +		if (ret)
>> +			break;
>> +	}
>> +
>> +	iommu_iotlb_sync(domain, &gather);
>> +	iommufd_dirty_iter_free(&iter);
>> +
>> +	return ret;
>> +}
>> +
>> +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
>> +				   struct iommu_domain *domain,
>> +				   struct iommufd_dirty_data *bitmap)
>> +{
>> +	unsigned long iova, length, iova_end;
>> +	struct iommu_domain *dom;
>> +	struct iopt_area *area;
>> +	unsigned long index;
>> +	int ret = -EOPNOTSUPP;
>> +
>> +	iova = bitmap->iova;
>> +	length = bitmap->length - 1;
>> +	if (check_add_overflow(iova, length, &iova_end))
>> +		return -EOVERFLOW;
>> +
>> +	down_read(&iopt->iova_rwsem);
>> +	area = iopt_find_exact_area(iopt, iova, iova_end);
>> +	if (!area) {
>> +		up_read(&iopt->iova_rwsem);
>> +		return -ENOENT;
>> +	}
>> +
>> +	if (!domain) {
>> +		down_read(&iopt->domains_rwsem);
>> +		xa_for_each(&iopt->domains, index, dom) {
>> +			ret = iommu_read_and_clear_dirty(dom, bitmap);
>
> Perhaps use @domain directly, hence no need for the @dom?
>
> 	xa_for_each(&iopt->domains, index, domain) {
> 		ret = iommu_read_and_clear_dirty(domain, bitmap);
>

Yeap.

>> +			if (ret)
>> +				break;
>> +		}
>> +		up_read(&iopt->domains_rwsem);
>> +	} else {
>> +		ret = iommu_read_and_clear_dirty(domain, bitmap);
>> +	}
>> +
>> +	up_read(&iopt->iova_rwsem);
>> +	return ret;
>> +}
>> +
>>  struct iopt_pages *iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
>>  				  unsigned long *start_byte,
>>  				  unsigned long length)
>> diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
>> index d00ef3b785c5..4c12b4a8f1a6 100644
>> --- a/drivers/iommu/iommufd/iommufd_private.h
>> +++ b/drivers/iommu/iommufd/iommufd_private.h
>> @@ -8,6 +8,8 @@
>>  #include <linux/xarray.h>
>>  #include <linux/refcount.h>
>>  #include <linux/uaccess.h>
>> +#include <linux/iommu.h>
>> +#include <linux/uio.h>
>>
>>  struct iommu_domain;
>>  struct iommu_group;
>> @@ -49,8 +51,50 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
>>  		    unsigned long length);
>>  int iopt_unmap_all(struct io_pagetable *iopt);
>>
>> +struct iommufd_dirty_data {
>> +	unsigned long iova;
>> +	unsigned long length;
>> +	unsigned long page_size;
>> +	unsigned long *data;
>> +};
>
> How about adding some comments around this struct? Any alignment
> requirement for iova/length? What does the @data stand for?
>

I'll add them, albeit this structure eventually gets moved into the iommu
core later in the series when we add the UAPI, and there it has some
comments documenting it. I don't cover the alignment there though, but
it's the same restrictions as IOAS map/unmap (iopt_alignment essentially),
which is the smallest page size supported by the IOMMU hw.
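Something along these lines, say (rough sketch, exact wording TBD):

	/**
	 * struct iommufd_dirty_data - IOVA range dirty bitmap
	 * @iova: base IOVA of the range; aligned to the IOMMU page size
	 * @length: size of the IOVA range in bytes; a multiple of the
	 *          IOMMU page size
	 * @page_size: granularity covered by each bit in @data
	 * @data: userspace bitmap where each bit represents one @page_size
	 *        of IOVA
	 */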
>> +
>>  int iopt_set_dirty_tracking(struct io_pagetable *iopt,
>>  			    struct iommu_domain *domain, bool enable);
>> +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
>> +				   struct iommu_domain *domain,
>> +				   struct iommufd_dirty_data *bitmap);
>> +
>> +struct iommufd_dirty_iter {
>> +	struct iommu_dirty_bitmap dirty;
>> +	struct iovec bitmap_iov;
>> +	struct iov_iter bitmap_iter;
>> +	unsigned long iova;
>> +};
>
> Same here.
>

Yes, this one deserves some comments. Most of it is state for gup/pup and
for iterating over the bitmap user addresses, so that
iommu_dirty_bitmap_record() only ever works with kvas.

>> +
>> +void iommufd_dirty_iter_put(struct iommufd_dirty_iter *iter);
>> +int iommufd_dirty_iter_get(struct iommufd_dirty_iter *iter);
>> +int iommufd_dirty_iter_init(struct iommufd_dirty_iter *iter,
>> +			    struct iommufd_dirty_data *bitmap);
>> +void iommufd_dirty_iter_free(struct iommufd_dirty_iter *iter);
>> +bool iommufd_dirty_iter_done(struct iommufd_dirty_iter *iter);
>> +void iommufd_dirty_iter_advance(struct iommufd_dirty_iter *iter);
>> +unsigned long iommufd_dirty_iova_length(struct iommufd_dirty_iter *iter);
>> +unsigned long iommufd_dirty_iova(struct iommufd_dirty_iter *iter);
>> +static inline unsigned long dirty_bitmap_bytes(unsigned long nr_pages)
>> +{
>> +	return (ALIGN(nr_pages, BITS_PER_TYPE(u64)) / BITS_PER_BYTE);
>> +}
>> +
>> +/*
>> + * The number-of-bits input argument to bitmap_set() is an unsigned
>> + * integer, which is further cast to a signed integer for the unaligned
>> + * multi-bit operation, __bitmap_set(). The maximum bitmap size supported
>> + * is therefore 2^31 bits divided by 2^3 bits/byte, that is 2^28 bytes
>> + * (256 MB), which maps to 2^31 * 2^12 = 2^43 (8TB) on a 4K page system.
>> + */
>> +#define DIRTY_BITMAP_PAGES_MAX	((u64)INT_MAX)
>> +#define DIRTY_BITMAP_SIZE_MAX	dirty_bitmap_bytes(DIRTY_BITMAP_PAGES_MAX)
>>
>>  int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova,
>>  		      unsigned long npages, struct page **out_pages, bool write);
>
> Best regards,
> baolu
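P.S.: following up on the iommufd_dirty_iter comments above, I'm thinking
of something along these lines (again just a sketch, wording TBD):

	/**
	 * struct iommufd_dirty_iter - State for walking a user dirty bitmap
	 * @dirty: kernel view of the currently pinned bitmap chunk, handed
	 *         to the IOMMU driver via iommu_dirty_bitmap_record()
	 * @bitmap_iov: single iovec describing the whole user bitmap
	 * @bitmap_iter: iov_iter tracking how much of @bitmap_iov has been
	 *               consumed so far
	 * @iova: base IOVA represented by bit 0 of the user bitmap
	 */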