On Fri, 18 Mar 2022 14:27:32 -0300 Jason Gunthorpe <jgg@xxxxxxxxxx> wrote: > +/* > + * The area takes a slice of the pages from start_byte to start_byte + length > + */ > +static struct iopt_area * > +iopt_alloc_area(struct io_pagetable *iopt, struct iopt_pages *pages, > + unsigned long iova, unsigned long start_byte, > + unsigned long length, int iommu_prot, unsigned int flags) > +{ > + struct iopt_area *area; > + int rc; > + > + area = kzalloc(sizeof(*area), GFP_KERNEL); > + if (!area) > + return ERR_PTR(-ENOMEM); > + > + area->iopt = iopt; > + area->iommu_prot = iommu_prot; > + area->page_offset = start_byte % PAGE_SIZE; > + area->pages_node.start = start_byte / PAGE_SIZE; > + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) > + return ERR_PTR(-EOVERFLOW); > + area->pages_node.last = area->pages_node.last / PAGE_SIZE; > + if (WARN_ON(area->pages_node.last >= pages->npages)) > + return ERR_PTR(-EOVERFLOW); @area leaked in the above two error cases. > + > + down_write(&iopt->iova_rwsem); > + if (flags & IOPT_ALLOC_IOVA) { > + rc = iopt_alloc_iova(iopt, &iova, > + (uintptr_t)pages->uptr + start_byte, > + length); > + if (rc) > + goto out_unlock; > + } > + > + if (check_add_overflow(iova, length - 1, &area->node.last)) { > + rc = -EOVERFLOW; > + goto out_unlock; > + } > + > + if (!(flags & IOPT_ALLOC_IOVA)) { > + if ((iova & (iopt->iova_alignment - 1)) || > + (length & (iopt->iova_alignment - 1)) || !length) { > + rc = -EINVAL; > + goto out_unlock; > + } > + > + /* No reserved IOVA intersects the range */ > + if (interval_tree_iter_first(&iopt->reserved_iova_itree, iova, > + area->node.last)) { > + rc = -ENOENT; > + goto out_unlock; > + } > + > + /* Check that there is not already a mapping in the range */ > + if (iopt_area_iter_first(iopt, iova, area->node.last)) { > + rc = -EADDRINUSE; > + goto out_unlock; > + } > + } > + > + /* > + * The area is inserted with a NULL pages indicating it is not fully > + * initialized yet. 
> + */ > + area->node.start = iova; > + interval_tree_insert(&area->node, &area->iopt->area_itree); > + up_write(&iopt->iova_rwsem); > + return area; > + > +out_unlock: > + up_write(&iopt->iova_rwsem); > + kfree(area); > + return ERR_PTR(rc); > +} ... > +/** > + * iopt_access_pages() - Return a list of pages under the iova > + * @iopt: io_pagetable to act on > + * @iova: Starting IOVA > + * @length: Number of bytes to access > + * @out_pages: Output page list > + * @write: True if access is for writing > + * > + * Reads @npages starting at iova and returns the struct page * pointers. These > + * can be kmap'd by the caller for CPU access. > + * > + * The caller must perform iopt_unaccess_pages() when done to balance this. > + * > + * iova can be unaligned from PAGE_SIZE. The first returned byte starts at > + * page_to_phys(out_pages[0]) + (iova % PAGE_SIZE). The caller promises not to > + * touch memory outside the requested iova slice. > + * > + * FIXME: callers that need a DMA mapping via a sgl should create another > + * interface to build the SGL efficiently > + */ > +int iopt_access_pages(struct io_pagetable *iopt, unsigned long iova, > + unsigned long length, struct page **out_pages, bool write) > +{ > + unsigned long cur_iova = iova; > + unsigned long last_iova; > + struct iopt_area *area; > + int rc; > + > + if (!length) > + return -EINVAL; > + if (check_add_overflow(iova, length - 1, &last_iova)) > + return -EOVERFLOW; > + > + down_read(&iopt->iova_rwsem); > + for (area = iopt_area_iter_first(iopt, iova, last_iova); area; > + area = iopt_area_iter_next(area, iova, last_iova)) { > + unsigned long last = min(last_iova, iopt_area_last_iova(area)); > + unsigned long last_index; > + unsigned long index; > + > + /* Need contiguous areas in the access */ > + if (iopt_area_iova(area) < cur_iova || !area->pages) { ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Should this be (cur_iova != iova && iopt_area_iova(area) < cur_iova)? 
I can't see how we'd require in-kernel page users to know the iopt_area alignment from userspace, so I think this needs to skip the first iteration. Thanks, Alex > + rc = -EINVAL; > + goto out_remove; > + } > + > + index = iopt_area_iova_to_index(area, cur_iova); > + last_index = iopt_area_iova_to_index(area, last); > + rc = iopt_pages_add_user(area->pages, index, last_index, > + out_pages, write); > + if (rc) > + goto out_remove; > + if (last == last_iova) > + break; > + /* > + * Can't cross areas that are not aligned to the system page > + * size with this API. > + */ > + if (cur_iova % PAGE_SIZE) { > + rc = -EINVAL; > + goto out_remove; > + } > + cur_iova = last + 1; > + out_pages += last_index - index; > + atomic_inc(&area->num_users); > + } > + > + up_read(&iopt->iova_rwsem); > + return 0; > + > +out_remove: > + if (cur_iova != iova) > + iopt_unaccess_pages(iopt, iova, cur_iova - iova); > + up_read(&iopt->iova_rwsem); > + return rc; > +}